diff --git a/agent/agent_init.py b/agent/agent_init.py new file mode 100644 index 00000000000..df8fe229e7b --- /dev/null +++ b/agent/agent_init.py @@ -0,0 +1,1469 @@ +"""Implementation of :meth:`AIAgent.__init__` — extracted as a module function. + +``AIAgent.__init__`` is one of the longest methods in the codebase (60+ +parameters, ~1,400 lines of attribute initialization, provider +auto-detection, credential resolution, context-engine bootstrap, etc.). +Keeping it in ``run_agent.py`` bloats that file with code that's mostly +"setup state, then forget". + +After this extraction the body lives here as ``init_agent(agent, ...)`` +and :meth:`AIAgent.__init__` is a thin wrapper that calls +``init_agent(self, ...)``. All imports the body needs at module-load +time are listed below; the body also performs many lazy imports inside +its own scope that come along unchanged. + +Symbols that tests patch on ``run_agent.*`` (``OpenAI``, ``cleanup_vm``, +etc.) are resolved through :func:`_ra` so the patch contract is +preserved. 
def _ra():
    """Return the ``run_agent`` module, importing it on demand.

    The import is performed inside the function instead of at module load
    time, and attributes are then read off the returned module at each call
    site (``_ra().OpenAI``, ``_ra().cleanup_vm``, ...). Callers that patch
    names on ``run_agent.*`` therefore have those patches reach code that
    resolves the names through this accessor.
    """
    import run_agent as _run_agent_module

    return _run_agent_module
iteration_budget: "IterationBudget" = None, + fallback_model: Dict[str, Any] = None, + credential_pool=None, + checkpoints_enabled: bool = False, + checkpoint_max_snapshots: int = 20, + checkpoint_max_total_size_mb: int = 500, + checkpoint_max_file_size_mb: int = 10, + pass_session_id: bool = False, +): + """ + Initialize the AI Agent. + + Args: + base_url (str): Base URL for the model API (optional) + api_key (str): API key for authentication (optional, uses env var if not provided) + provider (str): Provider identifier (optional; used for telemetry/routing hints) + api_mode (str): API mode override: "chat_completions" or "codex_responses" + model (str): Model name to use (default: "anthropic/claude-opus-4.6") + max_iterations (int): Maximum number of tool calling iterations (default: 90) + tool_delay (float): Delay between tool calls in seconds (default: 1.0) + enabled_toolsets (List[str]): Only enable tools from these toolsets (optional) + disabled_toolsets (List[str]): Disable tools from these toolsets (optional) + save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False) + verbose_logging (bool): Enable verbose logging for debugging (default: False) + quiet_mode (bool): Suppress progress output for clean CLI experience (default: False) + ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional) + log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100) + log_prefix (str): Prefix to add to all log messages for identification in parallel processing (default: "") + providers_allowed (List[str]): OpenRouter providers to allow (optional) + providers_ignored (List[str]): OpenRouter providers to ignore (optional) + providers_order (List[str]): OpenRouter providers to try in order (optional) + provider_sort (str): Sort providers by price/throughput/latency (optional) + openrouter_min_coding_score (float): Coding-score 
floor (0.0-1.0) for the + openrouter/pareto-code router. Only applied when model == "openrouter/pareto-code". + None or empty = let OpenRouter pick the strongest available coder. + session_id (str): Pre-generated session ID for logging (optional, auto-generated if not provided) + tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications + clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions. + Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error. + max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) + reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking). + If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning. + prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context. + Useful for injecting a few-shot example or priming the model's response style. + Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}] + NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an + assistant-role message (400 error). For those models use structured outputs or + output_config.format instead of a trailing-assistant prefill. + platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp"). + Used to inject platform-specific formatting hints into the system prompt. + skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules + into the system prompt. Use this for batch processing and data generation to avoid + polluting trajectories with user-specific persona or project instructions. + load_soul_identity (bool): If True, still use ~/.hermes/SOUL.md as the primary + identity even when skip_context_files=True. 
Project context files from the cwd + remain skipped. + """ + _install_safe_stdio() + + agent.model = model + agent.max_iterations = max_iterations + # Shared iteration budget — parent creates, children inherit. + # Consumed by every LLM turn across parent + all subagents. + agent.iteration_budget = iteration_budget or IterationBudget(max_iterations) + agent.tool_delay = tool_delay + agent.save_trajectories = save_trajectories + agent.verbose_logging = verbose_logging + agent.quiet_mode = quiet_mode + agent.ephemeral_system_prompt = ephemeral_system_prompt + agent.platform = platform # "cli", "telegram", "discord", "whatsapp", etc. + agent._user_id = user_id # Platform user identifier (gateway sessions) + agent._user_name = user_name + agent._chat_id = chat_id + agent._chat_name = chat_name + agent._chat_type = chat_type + agent._thread_id = thread_id + agent._gateway_session_key = gateway_session_key # Stable per-chat key (e.g. agent:main:telegram:dm:123) + # Pluggable print function — CLI replaces this with _cprint so that + # raw ANSI status lines are routed through prompt_toolkit's renderer + # instead of going directly to stdout where patch_stdout's StdoutProxy + # would mangle the escape sequences. None = use builtins.print. + agent._print_fn = None + agent.background_review_callback = None # Optional sync callback for gateway delivery + agent.skip_context_files = skip_context_files + agent.load_soul_identity = load_soul_identity + agent.pass_session_id = pass_session_id + agent._credential_pool = credential_pool + agent.log_prefix_chars = log_prefix_chars + agent.log_prefix = f"{log_prefix} " if log_prefix else "" + # Store effective base URL for feature detection (prompt caching, reasoning, etc.) 
+ agent.base_url = base_url or "" + provider_name = provider.strip().lower() if isinstance(provider, str) and provider.strip() else None + agent.provider = provider_name or "" + agent.acp_command = acp_command or command + agent.acp_args = list(acp_args or args or []) + if api_mode in {"chat_completions", "codex_responses", "anthropic_messages", "bedrock_converse", "codex_app_server"}: + agent.api_mode = api_mode + elif agent.provider == "openai-codex": + agent.api_mode = "codex_responses" + elif agent.provider in {"xai", "xai-oauth"}: + agent.api_mode = "codex_responses" + elif (provider_name is None) and ( + agent._base_url_hostname == "chatgpt.com" + and "/backend-api/codex" in agent._base_url_lower + ): + agent.api_mode = "codex_responses" + agent.provider = "openai-codex" + elif (provider_name is None) and agent._base_url_hostname == "api.x.ai": + agent.api_mode = "codex_responses" + agent.provider = "xai" + elif agent.provider == "anthropic" or (provider_name is None and agent._base_url_hostname == "api.anthropic.com"): + agent.api_mode = "anthropic_messages" + agent.provider = "anthropic" + elif agent._base_url_lower.rstrip("/").endswith("/anthropic"): + # Third-party Anthropic-compatible endpoints (e.g. MiniMax, DashScope) + # use a URL convention ending in /anthropic. Auto-detect these so the + # Anthropic Messages API adapter is used instead of chat completions. + agent.api_mode = "anthropic_messages" + elif agent.provider == "bedrock" or ( + agent._base_url_hostname.startswith("bedrock-runtime.") + and base_url_host_matches(agent._base_url_lower, "amazonaws.com") + ): + # AWS Bedrock — auto-detect from provider name or base URL + # (bedrock-runtime..amazonaws.com). + agent.api_mode = "bedrock_converse" + else: + agent.api_mode = "chat_completions" + + # Eagerly warm the transport cache so import errors surface at init, + # not mid-conversation. Also validates the api_mode is registered. 
+ try: + agent._get_transport() + except Exception: + pass # Non-fatal — transport may not exist for all modes yet + + try: + from hermes_cli.model_normalize import ( + _AGGREGATOR_PROVIDERS, + normalize_model_for_provider, + ) + + if agent.provider not in _AGGREGATOR_PROVIDERS: + agent.model = normalize_model_for_provider(agent.model, agent.provider) + except Exception: + pass + + # GPT-5.x models usually require the Responses API path, but some + # providers have exceptions (for example Copilot's gpt-5-mini still + # uses chat completions). Also auto-upgrade for direct OpenAI URLs + # (api.openai.com) since all newer tool-calling models prefer + # Responses there. ACP runtimes are excluded: CopilotACPClient + # handles its own routing and does not implement the Responses API + # surface. + # When api_mode was explicitly provided, respect it — the user + # knows what their endpoint supports (#10473). + # Exception: Azure OpenAI serves gpt-5.x on /chat/completions and + # does NOT support the Responses API — skip the upgrade for Azure + # (openai.azure.com), even though it looks OpenAI-compatible. + if ( + api_mode is None + and agent.api_mode == "chat_completions" + and agent.provider != "copilot-acp" + and not str(agent.base_url or "").lower().startswith("acp://copilot") + and not str(agent.base_url or "").lower().startswith("acp+tcp://") + and not agent._is_azure_openai_url() + and ( + agent._is_direct_openai_url() + or agent._provider_model_requires_responses_api( + agent.model, + provider=agent.provider, + ) + ) + ): + agent.api_mode = "codex_responses" + # Invalidate the eager-warmed transport cache — api_mode changed + # from chat_completions to codex_responses after the warm at __init__. + if hasattr(agent, "_transport_cache"): + agent._transport_cache.clear() + + # Pre-warm OpenRouter model metadata cache in a background thread. 
+ # fetch_model_metadata() is cached for 1 hour; this avoids a blocking + # HTTP request on the first API response when pricing is estimated. + # Use a process-level Event so this thread is only spawned once — a new + # AIAgent is created for every gateway request, so without the guard + # each message leaks one OS thread and the process eventually exhausts + # the system thread limit (RuntimeError: can't start new thread). + if (agent.provider == "openrouter" or agent._is_openrouter_url()) and \ + not _ra()._openrouter_prewarm_done.is_set(): + _ra()._openrouter_prewarm_done.set() + threading.Thread( + target=fetch_model_metadata, + daemon=True, + name="openrouter-prewarm", + ).start() + + agent.tool_progress_callback = tool_progress_callback + agent.tool_start_callback = tool_start_callback + agent.tool_complete_callback = tool_complete_callback + agent.suppress_status_output = False + agent.thinking_callback = thinking_callback + agent.reasoning_callback = reasoning_callback + agent.clarify_callback = clarify_callback + agent.step_callback = step_callback + agent.stream_delta_callback = stream_delta_callback + agent.interim_assistant_callback = interim_assistant_callback + agent.status_callback = status_callback + agent.tool_gen_callback = tool_gen_callback + + + # Tool execution state — allows _vprint during tool execution + # even when stream consumers are registered (no tokens streaming then) + agent._executing_tools = False + agent._tool_guardrails = ToolCallGuardrailController() + agent._tool_guardrail_halt_decision: ToolGuardrailDecision | None = None + + # Interrupt mechanism for breaking out of tool loops + agent._interrupt_requested = False + agent._interrupt_message = None # Optional message that triggered interrupt + agent._execution_thread_id: int | None = None # Set at run_conversation() start + agent._interrupt_thread_signal_pending = False + agent._client_lock = threading.RLock() + + # /steer mechanism — inject a user note into the next tool result 
+ # without interrupting the agent. Unlike interrupt(), steer() does + # NOT set _interrupt_requested; it waits for the current tool batch + # to finish naturally, then the drain hook appends the text to the + # last tool result's content so the model sees it on its next + # iteration. Message-role alternation is preserved (we modify an + # existing tool message rather than inserting a new user turn). + agent._pending_steer: Optional[str] = None + agent._pending_steer_lock = threading.Lock() + + # Concurrent-tool worker thread tracking. `_execute_tool_calls_concurrent` + # runs each tool on its own ThreadPoolExecutor worker — those worker + # threads have tids distinct from `_execution_thread_id`, so + # `_set_interrupt(True, _execution_thread_id)` alone does NOT cause + # `is_interrupted()` inside the worker to return True. Track the + # workers here so `interrupt()` / `clear_interrupt()` can fan out to + # their tids explicitly. + agent._tool_worker_threads: set[int] = set() + agent._tool_worker_threads_lock = threading.Lock() + + # Subagent delegation state + agent._delegate_depth = 0 # 0 = top-level agent, incremented for children + agent._active_children = [] # Running child AIAgents (for interrupt propagation) + agent._active_children_lock = threading.Lock() + + # Store OpenRouter provider preferences + agent.providers_allowed = providers_allowed + agent.providers_ignored = providers_ignored + agent.providers_order = providers_order + agent.provider_sort = provider_sort + agent.provider_require_parameters = provider_require_parameters + agent.provider_data_collection = provider_data_collection + agent.openrouter_min_coding_score = openrouter_min_coding_score + + # Store toolset filtering options + agent.enabled_toolsets = enabled_toolsets + agent.disabled_toolsets = disabled_toolsets + + # Model response configuration + agent.max_tokens = max_tokens # None = use model default + agent.reasoning_config = reasoning_config # None = use default (medium for 
OpenRouter) + agent.service_tier = service_tier + agent.request_overrides = dict(request_overrides or {}) + agent.prefill_messages = prefill_messages or [] # Prefilled conversation turns + agent._force_ascii_payload = False + + # Anthropic prompt caching: auto-enabled for Claude models on native + # Anthropic, OpenRouter, and third-party gateways that speak the + # Anthropic protocol (``api_mode == 'anthropic_messages'``). Reduces + # input costs by ~75% on multi-turn conversations. Uses system_and_3 + # strategy (4 breakpoints). See ``_anthropic_prompt_cache_policy`` + # for the layout-vs-transport decision. + agent._use_prompt_caching, agent._use_native_cache_layout = ( + agent._anthropic_prompt_cache_policy() + ) + # Anthropic supports "5m" (default) and "1h" cache TTL tiers. Read from + # config.yaml under prompt_caching.cache_ttl; unknown values keep "5m". + # 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long + # sessions with >5-minute pauses between turns (#14971). + agent._cache_ttl = "5m" + try: + from hermes_cli.config import load_config as _load_pc_cfg + + _pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {} + _ttl = _pc_cfg.get("cache_ttl", "5m") + if _ttl in {"5m", "1h"}: + agent._cache_ttl = _ttl + except Exception: + pass + + # Iteration budget: the LLM is only notified when it actually exhausts + # the iteration budget (api_call_count >= max_iterations). At that + # point we inject ONE message, allow one final API call, and if the + # model doesn't produce a text response, force a user-message asking + # it to summarise. No intermediate pressure warnings — they caused + # models to "give up" prematurely on complex tasks (#7915). + agent._budget_exhausted_injected = False + agent._budget_grace_call = False + + # Activity tracking — updated on each API call, tool execution, and + # stream chunk. 
Used by the gateway timeout handler to report what the + # agent was doing when it was killed, and by the "still working" + # notifications to show progress. + agent._last_activity_ts: float = time.time() + agent._last_activity_desc: str = "initializing" + agent._current_tool: str | None = None + agent._api_call_count: int = 0 + + # Rate limit tracking — updated from x-ratelimit-* response headers + # after each API call. Accessed by /usage slash command. + agent._rate_limit_state: Optional["RateLimitState"] = None + + # OpenRouter response cache hit counter — incremented when + # X-OpenRouter-Cache-Status: HIT is seen in streaming response headers. + agent._or_cache_hits: int = 0 + + # Centralized logging — agent.log (INFO+) and errors.log (WARNING+) + # both live under ~/.hermes/logs/. Idempotent, so gateway mode + # (which creates a new AIAgent per message) won't duplicate handlers. + from hermes_logging import setup_logging, setup_verbose_logging + setup_logging(hermes_home=_ra()._hermes_home) + + if agent.verbose_logging: + setup_verbose_logging() + _ra().logger.info("Verbose logging enabled (third-party library logs suppressed)") + elif agent.quiet_mode: + # In quiet mode (CLI default), keep console output clean — + # but DO NOT raise per-logger levels. Doing so prevents the + # root logger's file handlers (agent.log, errors.log) from + # ever seeing the records, because Python checks + # logger.isEnabledFor() before handler propagation. We rely + # on the fact that hermes_logging.setup_logging() does not + # install a console StreamHandler in quiet mode — so INFO + # records flow to the file handlers but never reach a + # console. Any future noise reduction belongs at the + # handler level inside hermes_logging.py, not here. + pass + + # Internal stream callback (set during streaming TTS). + # Initialized here so _vprint can reference it before run_conversation. 
+ agent._stream_callback = None + # Deferred paragraph break flag — set after tool iterations so a + # single "\n\n" is prepended to the next real text delta. + agent._stream_needs_break = False + # Stateful scrubber for spans split across stream + # deltas (#5719). sanitize_context() alone can't survive chunk + # boundaries because the block regex needs both tags in one string. + agent._stream_context_scrubber = StreamingContextScrubber() + # Stateful scrubber for reasoning/thinking tags in streamed deltas + # (#17924). Replaces the per-delta _strip_think_blocks regex that + # destroyed downstream state (e.g. MiniMax-M2.7 streaming + # '' as delta1 and 'Let me check' as delta2 — the regex + # erased delta1, so downstream state machines never learned a + # block was open and leaked delta2 as content). + agent._stream_think_scrubber = StreamingThinkScrubber() + # Visible assistant text already delivered through live token callbacks + # during the current model response. Used to avoid re-sending the same + # commentary when the provider later returns it as a completed interim + # assistant message. + agent._current_streamed_assistant_text = "" + + # Optional current-turn user-message override used when the API-facing + # user message intentionally differs from the persisted transcript + # (e.g. CLI voice mode adds a temporary prefix for the live call only). + agent._persist_user_message_idx = None + agent._persist_user_message_override = None + + # Cache anthropic image-to-text fallbacks per image payload/URL so a + # single tool loop does not repeatedly re-run auxiliary vision on the + # same image history. + agent._anthropic_image_fallback_cache: Dict[str, str] = {} + + # Initialize LLM client via centralized provider router. + # The router handles auth resolution, base URL, headers, and + # Codex/Anthropic wrapping for all known providers. + # raw_codex=True because the main agent needs direct responses.stream() + # access for Codex Responses API streaming. 
+ agent._anthropic_client = None + agent._is_anthropic_oauth = False + + # Resolve per-provider / per-model request timeout once up front so + # every client construction path below (Anthropic native, OpenAI-wire, + # router-based implicit auth) can apply it consistently. Bedrock + # Claude uses its own timeout path and is not covered here. + _provider_timeout = get_provider_request_timeout(agent.provider, agent.model) + + if agent.api_mode == "anthropic_messages": + from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token + # Bedrock + Claude → use AnthropicBedrock SDK for full feature parity + # (prompt caching, thinking budgets, adaptive thinking). + _is_bedrock_anthropic = agent.provider == "bedrock" + if _is_bedrock_anthropic: + from agent.anthropic_adapter import build_anthropic_bedrock_client + _region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "") + _br_region = _region_match.group(1) if _region_match else "us-east-1" + agent._bedrock_region = _br_region + agent._anthropic_client = build_anthropic_bedrock_client(_br_region) + agent._anthropic_api_key = "aws-sdk" + agent._anthropic_base_url = base_url + agent._is_anthropic_oauth = False + agent.api_key = "aws-sdk" + agent.client = None + agent._client_kwargs = {} + if not agent.quiet_mode: + print(f"🤖 AI Agent initialized with model: {agent.model} (AWS Bedrock + AnthropicBedrock SDK, {_br_region})") + else: + # Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic. + # Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own API key. + # Falling back would send Anthropic credentials to third-party endpoints (Fixes #1739, #minimax-401). 
+ _is_native_anthropic = agent.provider == "anthropic" + effective_key = (api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or "") + agent.api_key = effective_key + agent._anthropic_api_key = effective_key + agent._anthropic_base_url = base_url + # Only mark the session as OAuth-authenticated when the token + # genuinely belongs to native Anthropic. Third-party providers + # (MiniMax, Kimi, GLM, LiteLLM proxies) that accept the + # Anthropic protocol must never trip OAuth code paths — doing + # so injects Claude-Code identity headers and system prompts + # that cause 401/403 on their endpoints. Guards #1739 and + # the third-party identity-injection bug. + from agent.anthropic_adapter import _is_oauth_token as _is_oat + agent._is_anthropic_oauth = _is_oat(effective_key) if _is_native_anthropic else False + agent._anthropic_client = build_anthropic_client(effective_key, base_url, timeout=_provider_timeout) + # No OpenAI client needed for Anthropic mode + agent.client = None + agent._client_kwargs = {} + if not agent.quiet_mode: + print(f"🤖 AI Agent initialized with model: {agent.model} (Anthropic native)") + if effective_key and len(effective_key) > 12: + print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}") + elif agent.api_mode == "bedrock_converse": + # AWS Bedrock — uses boto3 directly, no OpenAI client needed. + # Region is extracted from the base_url or defaults to us-east-1. + _region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "") + agent._bedrock_region = _region_match.group(1) if _region_match else "us-east-1" + # Guardrail config — read from config.yaml at init time. 
+ agent._bedrock_guardrail_config = None + try: + from hermes_cli.config import load_config as _load_br_cfg + _gr = _load_br_cfg().get("bedrock", {}).get("guardrail", {}) + if _gr.get("guardrail_identifier") and _gr.get("guardrail_version"): + agent._bedrock_guardrail_config = { + "guardrailIdentifier": _gr["guardrail_identifier"], + "guardrailVersion": _gr["guardrail_version"], + } + if _gr.get("stream_processing_mode"): + agent._bedrock_guardrail_config["streamProcessingMode"] = _gr["stream_processing_mode"] + if _gr.get("trace"): + agent._bedrock_guardrail_config["trace"] = _gr["trace"] + except Exception: + pass + agent.client = None + agent._client_kwargs = {} + if not agent.quiet_mode: + _gr_label = " + Guardrails" if agent._bedrock_guardrail_config else "" + print(f"🤖 AI Agent initialized with model: {agent.model} (AWS Bedrock, {agent._bedrock_region}{_gr_label})") + else: + if api_key and base_url: + # Explicit credentials from CLI/gateway — construct directly. + # The runtime provider resolver already handled auth for us. + # Extract query params (e.g. Azure api-version) from base_url + # and pass via default_query to prevent loss during SDK URL + # joining (httpx drops query string when joining paths). 
+ _parsed_url = urlparse(base_url) + if _parsed_url.query: + _clean_url = urlunparse(_parsed_url._replace(query="")) + _query_params = { + k: v[0] for k, v in parse_qs(_parsed_url.query).items() + } + client_kwargs = { + "api_key": api_key, + "base_url": _clean_url, + "default_query": _query_params, + } + else: + client_kwargs = {"api_key": api_key, "base_url": base_url} + if _provider_timeout is not None: + client_kwargs["timeout"] = _provider_timeout + if agent.provider == "copilot-acp": + client_kwargs["command"] = agent.acp_command + client_kwargs["args"] = agent.acp_args + effective_base = base_url + if base_url_host_matches(effective_base, "openrouter.ai"): + from agent.auxiliary_client import build_or_headers + client_kwargs["default_headers"] = build_or_headers() + elif base_url_host_matches(effective_base, "integrate.api.nvidia.com"): + from agent.auxiliary_client import build_nvidia_nim_headers + client_kwargs["default_headers"] = build_nvidia_nim_headers(effective_base) + elif base_url_host_matches(effective_base, "api.routermint.com"): + client_kwargs["default_headers"] = _ra()._routermint_headers() + elif base_url_host_matches(effective_base, "api.githubcopilot.com"): + from hermes_cli.models import copilot_default_headers + + client_kwargs["default_headers"] = copilot_default_headers() + elif base_url_host_matches(effective_base, "api.kimi.com"): + client_kwargs["default_headers"] = { + "User-Agent": "claude-code/0.1.0", + } + elif base_url_host_matches(effective_base, "portal.qwen.ai"): + client_kwargs["default_headers"] = _ra()._qwen_portal_headers() + elif base_url_host_matches(effective_base, "chatgpt.com"): + from agent.auxiliary_client import _codex_cloudflare_headers + client_kwargs["default_headers"] = _codex_cloudflare_headers(api_key) + elif "default_headers" not in client_kwargs: + # Fall back to profile.default_headers for providers that + # declare custom headers (e.g. 
Vercel AI Gateway attribution, + # Kimi User-Agent on non-kimi.com endpoints). + try: + from providers import get_provider_profile as _gpf + _ph = _gpf(agent.provider) + if _ph and _ph.default_headers: + client_kwargs["default_headers"] = dict(_ph.default_headers) + except Exception: + pass + else: + # No explicit creds — use the centralized provider router + from agent.auxiliary_client import resolve_provider_client + _routed_client, _ = resolve_provider_client( + agent.provider or "auto", model=agent.model, raw_codex=True) + if _routed_client is not None: + client_kwargs = { + "api_key": _routed_client.api_key, + "base_url": str(_routed_client.base_url), + } + if _provider_timeout is not None: + client_kwargs["timeout"] = _provider_timeout + # Preserve provider-specific headers the router set. The + # OpenAI SDK stores caller-provided default_headers in + # _custom_headers; older/mocked clients may expose + # _default_headers instead. + _routed_headers = getattr(_routed_client, "_custom_headers", None) + if not _routed_headers: + _routed_headers = getattr(_routed_client, "_default_headers", None) + if _routed_headers: + client_kwargs["default_headers"] = dict(_routed_headers) + else: + # When the user explicitly chose a non-OpenRouter provider + # but no credentials were found, fail fast with a clear + # message instead of silently routing through OpenRouter. + _explicit = (agent.provider or "").strip().lower() + if _explicit and _explicit not in {"auto", "openrouter", "custom"}: + # Look up the actual env var name from the provider + # config — some providers use non-standard names + # (e.g. alibaba → DASHSCOPE_API_KEY, not ALIBABA_API_KEY). 
+ _env_hint = f"{_explicit.upper()}_API_KEY" + try: + from hermes_cli.auth import PROVIDER_REGISTRY + _pcfg = PROVIDER_REGISTRY.get(_explicit) + if _pcfg and _pcfg.api_key_env_vars: + _env_hint = _pcfg.api_key_env_vars[0] + except Exception: + pass + # --- Init-time fallback (#17929) --- + _fb_entries = [] + if isinstance(fallback_model, list): + _fb_entries = [ + f for f in fallback_model + if isinstance(f, dict) and f.get("provider") and f.get("model") + ] + elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"): + _fb_entries = [fallback_model] + _fb_resolved = False + for _fb in _fb_entries: + _fb_explicit_key = (_fb.get("api_key") or "").strip() or None + if not _fb_explicit_key: + _fb_key_env = (_fb.get("key_env") or _fb.get("api_key_env") or "").strip() + if _fb_key_env: + _fb_explicit_key = os.getenv(_fb_key_env, "").strip() or None + _fb_client, _fb_model = resolve_provider_client( + _fb["provider"], model=_fb["model"], raw_codex=True, + explicit_base_url=_fb.get("base_url"), + explicit_api_key=_fb_explicit_key, + ) + if _fb_client is not None: + agent.provider = _fb["provider"] + agent.model = _fb_model or _fb["model"] + agent._fallback_activated = True + client_kwargs = { + "api_key": _fb_client.api_key, + "base_url": str(_fb_client.base_url), + } + if _provider_timeout is not None: + client_kwargs["timeout"] = _provider_timeout + _fb_headers = getattr(_fb_client, "_custom_headers", None) + if not _fb_headers: + _fb_headers = getattr(_fb_client, "_default_headers", None) + if _fb_headers: + client_kwargs["default_headers"] = dict(_fb_headers) + _fb_resolved = True + break + if not _fb_resolved: + raise RuntimeError( + f"Provider '{_explicit}' is set in config.yaml but no API key " + f"was found. Set the {_env_hint} environment " + f"variable, or switch to a different provider with `hermes model`." 
+ ) + if not getattr(agent, "_fallback_activated", False): + # No provider configured — reject with a clear message. + raise RuntimeError( + "No LLM provider configured. Run `hermes model` to " + "select a provider, or run `hermes setup` for first-time " + "configuration." + ) + + agent._client_kwargs = client_kwargs # stored for rebuilding after interrupt + + # Enable fine-grained tool streaming for Claude on OpenRouter. + # Without this, Anthropic buffers the entire tool call and goes + # silent for minutes while thinking — OpenRouter's upstream proxy + # times out during the silence. The beta header makes Anthropic + # stream tool call arguments token-by-token, keeping the + # connection alive. + _effective_base = str(client_kwargs.get("base_url", "")).lower() + if base_url_host_matches(_effective_base, "openrouter.ai") and "claude" in (agent.model or "").lower(): + headers = client_kwargs.get("default_headers") or {} + existing_beta = headers.get("x-anthropic-beta", "") + _FINE_GRAINED = "fine-grained-tool-streaming-2025-05-14" + if _FINE_GRAINED not in existing_beta: + if existing_beta: + headers["x-anthropic-beta"] = f"{existing_beta},{_FINE_GRAINED}" + else: + headers["x-anthropic-beta"] = _FINE_GRAINED + client_kwargs["default_headers"] = headers + + agent.api_key = client_kwargs.get("api_key", "") + agent.base_url = client_kwargs.get("base_url", agent.base_url) + try: + agent.client = agent._create_openai_client(client_kwargs, reason="agent_init", shared=True) + if not agent.quiet_mode: + print(f"🤖 AI Agent initialized with model: {agent.model}") + if base_url: + print(f"🔗 Using custom base URL: {base_url}") + # Always show API key info (masked) for debugging auth issues + key_used = client_kwargs.get("api_key", "none") + if key_used and key_used != "dummy-key" and len(key_used) > 12: + print(f"🔑 Using API key: {key_used[:8]}...{key_used[-4:]}") + else: + print(f"⚠️ Warning: API key appears invalid or missing (got: '{key_used[:20] if key_used else 
'none'}...')") + except Exception as e: + raise RuntimeError(f"Failed to initialize OpenAI client: {e}") + + # Provider fallback chain — ordered list of backup providers tried + # when the primary is exhausted (rate-limit, overload, connection + # failure). Supports both legacy single-dict ``fallback_model`` and + # new list ``fallback_providers`` format. + if isinstance(fallback_model, list): + agent._fallback_chain = [ + f for f in fallback_model + if isinstance(f, dict) and f.get("provider") and f.get("model") + ] + elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"): + agent._fallback_chain = [fallback_model] + else: + agent._fallback_chain = [] + agent._fallback_index = 0 + agent._fallback_activated = getattr(agent, "_fallback_activated", False) + # Legacy attribute kept for backward compat (tests, external callers) + agent._fallback_model = agent._fallback_chain[0] if agent._fallback_chain else None + if agent._fallback_chain and not agent.quiet_mode: + if len(agent._fallback_chain) == 1: + fb = agent._fallback_chain[0] + print(f"🔄 Fallback model: {fb['model']} ({fb['provider']})") + else: + print(f"🔄 Fallback chain ({len(agent._fallback_chain)} providers): " + + " → ".join(f"{f['model']} ({f['provider']})" for f in agent._fallback_chain)) + + # Get available tools with filtering + agent.tools = _ra().get_tool_definitions( + enabled_toolsets=enabled_toolsets, + disabled_toolsets=disabled_toolsets, + quiet_mode=agent.quiet_mode, + ) + + # Show tool configuration and store valid tool names for validation + agent.valid_tool_names = set() + if agent.tools: + agent.valid_tool_names = {tool["function"]["name"] for tool in agent.tools} + tool_names = sorted(agent.valid_tool_names) + if not agent.quiet_mode: + print(f"🛠️ Loaded {len(agent.tools)} tools: {', '.join(tool_names)}") + + # Show filtering info if applied + if enabled_toolsets: + print(f" ✅ Enabled toolsets: {', '.join(enabled_toolsets)}") + if 
disabled_toolsets: + print(f" ❌ Disabled toolsets: {', '.join(disabled_toolsets)}") + elif not agent.quiet_mode: + print("🛠️ No tools loaded (all tools filtered out or unavailable)") + + # Check tool requirements + if agent.tools and not agent.quiet_mode: + requirements = _ra().check_toolset_requirements() + missing_reqs = [name for name, available in requirements.items() if not available] + if missing_reqs: + print(f"⚠️ Some tools may not work due to missing requirements: {missing_reqs}") + + # Show trajectory saving status + if agent.save_trajectories and not agent.quiet_mode: + print("📝 Trajectory saving enabled") + + # Show ephemeral system prompt status + if agent.ephemeral_system_prompt and not agent.quiet_mode: + prompt_preview = agent.ephemeral_system_prompt[:60] + "..." if len(agent.ephemeral_system_prompt) > 60 else agent.ephemeral_system_prompt + print(f"🔒 Ephemeral system prompt: '{prompt_preview}' (not saved to trajectories)") + + # Show prompt caching status + if agent._use_prompt_caching and not agent.quiet_mode: + if agent._use_native_cache_layout and agent.provider == "anthropic": + source = "native Anthropic" + elif agent._use_native_cache_layout: + source = "Anthropic-compatible endpoint" + else: + source = "Claude via OpenRouter" + print(f"💾 Prompt caching: ENABLED ({source}, {agent._cache_ttl} TTL)") + + # Session logging setup - auto-save conversation trajectories for debugging + agent.session_start = datetime.now() + if session_id: + # Use provided session ID (e.g., from CLI) + agent.session_id = session_id + else: + # Generate a new session ID + timestamp_str = agent.session_start.strftime("%Y%m%d_%H%M%S") + short_uuid = uuid.uuid4().hex[:6] + agent.session_id = f"{timestamp_str}_{short_uuid}" + + # Expose session ID to tools (terminal, execute_code) so agents can + # reference their own session for --resume commands, cross-session + # coordination, and logging. 
Uses the ContextVar system from + # session_context.py for concurrency safety (gateway runs multiple + # sessions in one process). Also writes os.environ as fallback for + # CLI mode where ContextVars aren't used. + os.environ["HERMES_SESSION_ID"] = agent.session_id + try: + from gateway.session_context import _SESSION_ID + _SESSION_ID.set(agent.session_id) + except Exception: + pass # CLI/test mode — ContextVar not needed + + # Session logs go into ~/.hermes/sessions/ alongside gateway sessions + hermes_home = get_hermes_home() + agent.logs_dir = hermes_home / "sessions" + agent.logs_dir.mkdir(parents=True, exist_ok=True) + agent.session_log_file = agent.logs_dir / f"session_{agent.session_id}.json" + + # Track conversation messages for session logging + agent._session_messages: List[Dict[str, Any]] = [] + agent._memory_write_origin = "assistant_tool" + agent._memory_write_context = "foreground" + + # Cached system prompt -- built once per session, only rebuilt on compression + agent._cached_system_prompt: Optional[str] = None + + # Filesystem checkpoint manager (transparent — not a tool) + from tools.checkpoint_manager import CheckpointManager + agent._checkpoint_mgr = CheckpointManager( + enabled=checkpoints_enabled, + max_snapshots=checkpoint_max_snapshots, + max_total_size_mb=checkpoint_max_total_size_mb, + max_file_size_mb=checkpoint_max_file_size_mb, + ) + + # SQLite session store (optional -- provided by CLI or gateway) + agent._session_db = session_db + agent._parent_session_id = parent_session_id + agent._last_flushed_db_idx = 0 # tracks DB-write cursor to prevent duplicate writes + agent._session_db_created = False # DB row deferred to run_conversation() + agent._session_init_model_config = { + "max_iterations": agent.max_iterations, + "reasoning_config": reasoning_config, + "max_tokens": max_tokens, + } + + # In-memory todo list for task planning (one per agent/session) + from tools.todo_tool import TodoStore + agent._todo_store = TodoStore() + + # Load 
config once for memory, skills, and compression sections + try: + from hermes_cli.config import load_config as _load_agent_config + _agent_cfg = _load_agent_config() + except Exception: + _agent_cfg = {} + try: + agent._tool_guardrails = ToolCallGuardrailController( + ToolCallGuardrailConfig.from_mapping( + _agent_cfg.get("tool_loop_guardrails", {}) + ) + ) + except Exception as _tlg_err: + _ra().logger.warning("Tool loop guardrail config ignored: %s", _tlg_err) + # Cache only the derived auxiliary compression context override that is + # needed later by the startup feasibility check. Avoid exposing a + # broad pseudo-public config object on the agent instance. + agent._aux_compression_context_length_config = None + + # Persistent memory (MEMORY.md + USER.md) -- loaded from disk + agent._memory_store = None + agent._memory_enabled = False + agent._user_profile_enabled = False + agent._memory_nudge_interval = 10 + agent._turns_since_memory = 0 + agent._iters_since_skill = 0 + if not skip_memory: + try: + mem_config = _agent_cfg.get("memory", {}) + agent._memory_enabled = mem_config.get("memory_enabled", False) + agent._user_profile_enabled = mem_config.get("user_profile_enabled", False) + agent._memory_nudge_interval = int(mem_config.get("nudge_interval", 10)) + if agent._memory_enabled or agent._user_profile_enabled: + from tools.memory_tool import MemoryStore + agent._memory_store = MemoryStore( + memory_char_limit=mem_config.get("memory_char_limit", 2200), + user_char_limit=mem_config.get("user_char_limit", 1375), + ) + agent._memory_store.load_from_disk() + except Exception: + pass # Memory is optional -- don't break agent init + + + + # Memory provider plugin (external — one at a time, alongside built-in) + # Reads memory.provider from config to select which plugin to activate. 
+ agent._memory_manager = None + if not skip_memory: + try: + _mem_provider_name = mem_config.get("provider", "") if mem_config else "" + + if _mem_provider_name and _mem_provider_name.strip(): + from agent.memory_manager import MemoryManager as _MemoryManager + from plugins.memory import load_memory_provider as _load_mem + agent._memory_manager = _MemoryManager() + _mp = _load_mem(_mem_provider_name) + if _mp and _mp.is_available(): + agent._memory_manager.add_provider(_mp) + if agent._memory_manager.providers: + _init_kwargs = { + "session_id": agent.session_id, + "platform": platform or "cli", + "hermes_home": str(get_hermes_home()), + "agent_context": "primary", + } + # Thread session title for memory provider scoping + # (e.g. honcho uses this to derive chat-scoped session keys) + if agent._session_db: + try: + _st = agent._session_db.get_session_title(agent.session_id) + if _st: + _init_kwargs["session_title"] = _st + except Exception: + pass + # Thread gateway user identity for per-user memory scoping + if agent._user_id: + _init_kwargs["user_id"] = agent._user_id + if agent._user_name: + _init_kwargs["user_name"] = agent._user_name + if agent._chat_id: + _init_kwargs["chat_id"] = agent._chat_id + if agent._chat_name: + _init_kwargs["chat_name"] = agent._chat_name + if agent._chat_type: + _init_kwargs["chat_type"] = agent._chat_type + if agent._thread_id: + _init_kwargs["thread_id"] = agent._thread_id + # Thread gateway session key for stable per-chat Honcho session isolation + if agent._gateway_session_key: + _init_kwargs["gateway_session_key"] = agent._gateway_session_key + # Profile identity for per-profile provider scoping + try: + from hermes_cli.profiles import get_active_profile_name + _profile = get_active_profile_name() + _init_kwargs["agent_identity"] = _profile + _init_kwargs["agent_workspace"] = "hermes" + except Exception: + pass + agent._memory_manager.initialize_all(**_init_kwargs) + _ra().logger.info("Memory provider '%s' activated", 
_mem_provider_name) + else: + _ra().logger.debug("Memory provider '%s' not found or not available", _mem_provider_name) + agent._memory_manager = None + except Exception as _mpe: + _ra().logger.warning("Memory provider plugin init failed: %s", _mpe) + agent._memory_manager = None + + # Inject memory provider tool schemas into the tool surface. + # Skip tools whose names already exist (plugins may register the + # same tools via ctx.register_tool(), which lands in agent.tools + # through _ra().get_tool_definitions()). Duplicate function names cause + # 400 errors on providers that enforce unique names (e.g. Xiaomi + # MiMo via Nous Portal). + if agent._memory_manager and agent.tools is not None: + _existing_tool_names = { + t.get("function", {}).get("name") + for t in agent.tools + if isinstance(t, dict) + } + for _schema in agent._memory_manager.get_all_tool_schemas(): + _tname = _schema.get("name", "") + if _tname and _tname in _existing_tool_names: + continue # already registered via plugin path + _wrapped = {"type": "function", "function": _schema} + agent.tools.append(_wrapped) + if _tname: + agent.valid_tool_names.add(_tname) + _existing_tool_names.add(_tname) + + # Skills config: nudge interval for skill creation reminders + agent._skill_nudge_interval = 10 + try: + skills_config = _agent_cfg.get("skills", {}) + agent._skill_nudge_interval = int(skills_config.get("creation_nudge_interval", 10)) + except Exception: + pass + + # Tool-use enforcement config: "auto" (default — matches hardcoded + # model list), true (always), false (never), or list of substrings. + _agent_section = _agent_cfg.get("agent", {}) + if not isinstance(_agent_section, dict): + _agent_section = {} + agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto") + + # App-level API retry count (wraps each model API call). Default 3, + # overridable via agent.api_max_retries in config.yaml. See #11616. 
+ try: + _raw_api_retries = _agent_section.get("api_max_retries", 3) + _api_retries = int(_raw_api_retries) + _api_retries = max(_api_retries, 1) # 1 = no retry (single attempt) + except (TypeError, ValueError): + _api_retries = 3 + agent._api_max_retries = _api_retries + + # Initialize context compressor for automatic context management + # Compresses conversation when approaching model's context limit + # Configuration via config.yaml (compression section) + _compression_cfg = _agent_cfg.get("compression", {}) + if not isinstance(_compression_cfg, dict): + _compression_cfg = {} + compression_threshold = float(_compression_cfg.get("threshold", 0.50)) + try: + from agent.auxiliary_client import _compression_threshold_for_model as _cthresh_fn + _model_cthresh = _cthresh_fn(agent.model) + if _model_cthresh is not None: + compression_threshold = _model_cthresh + except Exception: + pass + compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"} + compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20)) + compression_protect_last = int(_compression_cfg.get("protect_last_n", 20)) + # protect_first_n is the number of non-system messages to protect at + # the head, in addition to the system prompt (which is always + # implicitly protected by the compressor). Floor at 0 — a value of + # 0 means "preserve only the system prompt + summary + tail", which + # is a legitimate (and common) configuration for long-running + # rolling-compaction sessions. + compression_protect_first = max( + 0, int(_compression_cfg.get("protect_first_n", 3)) + ) + + # Read optional explicit context_length override for the auxiliary + # compression model. Custom endpoints often cannot report this via + # /models, so the startup feasibility check needs the config hint. 
+ try: + _aux_cfg = cfg_get(_agent_cfg, "auxiliary", "compression", default={}) + except Exception: + _aux_cfg = {} + if isinstance(_aux_cfg, dict): + _aux_context_config = _aux_cfg.get("context_length") + else: + _aux_context_config = None + if _aux_context_config is not None: + try: + _aux_context_config = int(_aux_context_config) + except (TypeError, ValueError): + _aux_context_config = None + agent._aux_compression_context_length_config = _aux_context_config + + # Read explicit model output-token override from config when the + # caller did not pass one directly. + _model_cfg = _agent_cfg.get("model", {}) + if agent.max_tokens is None and isinstance(_model_cfg, dict): + _config_max_tokens = _model_cfg.get("max_tokens") + if _config_max_tokens is not None: + try: + if isinstance(_config_max_tokens, bool): + raise ValueError + _parsed_max_tokens = int(_config_max_tokens) + if _parsed_max_tokens <= 0: + raise ValueError + agent.max_tokens = _parsed_max_tokens + except (TypeError, ValueError): + _ra().logger.warning( + "Invalid model.max_tokens in config.yaml: %r — " + "must be a positive integer (e.g. 4096). " + "Falling back to provider default.", + _config_max_tokens, + ) + print( + f"\n⚠ Invalid model.max_tokens in config.yaml: {_config_max_tokens!r}\n" + f" Must be a positive integer (e.g. 4096).\n" + f" Falling back to provider default.\n", + file=sys.stderr, + ) + agent._session_init_model_config["max_tokens"] = agent.max_tokens + + # Read explicit context_length override from model config + if isinstance(_model_cfg, dict): + _config_context_length = _model_cfg.get("context_length") + else: + _config_context_length = None + if _config_context_length is not None: + try: + _config_context_length = int(_config_context_length) + except (TypeError, ValueError): + _ra().logger.warning( + "Invalid model.context_length in config.yaml: %r — " + "must be a plain integer (e.g. 256000, not '256K'). 
" + "Falling back to auto-detection.", + _config_context_length, + ) + print( + f"\n⚠ Invalid model.context_length in config.yaml: {_config_context_length!r}\n" + f" Must be a plain integer (e.g. 256000, not '256K').\n" + f" Falling back to auto-detected context window.\n", + file=sys.stderr, + ) + _config_context_length = None + + # Resolve custom_providers list once for reuse below (startup + # context-length override and plugin context-engine init). + try: + from hermes_cli.config import get_compatible_custom_providers + _custom_providers = get_compatible_custom_providers(_agent_cfg) + except Exception: + _custom_providers = _agent_cfg.get("custom_providers") + if not isinstance(_custom_providers, list): + _custom_providers = [] + + # Store for reuse by _check_compression_model_feasibility (auxiliary + # compression model context-length detection needs the same list). + agent._custom_providers = _custom_providers + + # Check custom_providers per-model context_length + if _config_context_length is None and _custom_providers: + try: + from hermes_cli.config import get_custom_provider_context_length + _cp_ctx_resolved = get_custom_provider_context_length( + model=agent.model, + base_url=agent.base_url, + custom_providers=_custom_providers, + ) + if _cp_ctx_resolved: + _config_context_length = int(_cp_ctx_resolved) + except Exception: + _cp_ctx_resolved = None + + # Surface a clear warning if the user set a context_length but it + # wasn't a valid positive int — the helper silently skips those. 
+ if _config_context_length is None: + _target = agent.base_url.rstrip("/") if agent.base_url else "" + for _cp_entry in _custom_providers: + if not isinstance(_cp_entry, dict): + continue + _cp_url = (_cp_entry.get("base_url") or "").rstrip("/") + if _target and _cp_url == _target: + _cp_models = _cp_entry.get("models", {}) + if isinstance(_cp_models, dict): + _cp_model_cfg = _cp_models.get(agent.model, {}) + if isinstance(_cp_model_cfg, dict): + _cp_ctx = _cp_model_cfg.get("context_length") + if _cp_ctx is not None: + try: + _parsed = int(_cp_ctx) + if _parsed <= 0: + raise ValueError + except (TypeError, ValueError): + _ra().logger.warning( + "Invalid context_length for model %r in " + "custom_providers: %r — must be a positive " + "integer (e.g. 256000, not '256K'). " + "Falling back to auto-detection.", + agent.model, _cp_ctx, + ) + print( + f"\n⚠ Invalid context_length for model {agent.model!r} in custom_providers: {_cp_ctx!r}\n" + f" Must be a positive integer (e.g. 256000, not '256K').\n" + f" Falling back to auto-detected context window.\n", + file=sys.stderr, + ) + break + + # Persist for reuse on switch_model / fallback activation. Must come + # AFTER the custom_providers branch so per-model overrides aren't lost. + agent._config_context_length = _config_context_length + + agent._ensure_lmstudio_runtime_loaded(_config_context_length) + + + + # Select context engine: config-driven (like memory providers). + # 1. Check config.yaml context.engine setting + # 2. Check plugins/context_engine// directory (repo-shipped) + # 3. Check general plugin system (user-installed plugins) + # 4. 
Fall back to built-in ContextCompressor + _selected_engine = None + _engine_name = "compressor" # default + try: + _ctx_cfg = _agent_cfg.get("context", {}) if isinstance(_agent_cfg, dict) else {} + _engine_name = _ctx_cfg.get("engine", "compressor") or "compressor" + except Exception: + pass + + if _engine_name != "compressor": + # Try loading from plugins/context_engine// + try: + from plugins.context_engine import load_context_engine + _selected_engine = load_context_engine(_engine_name) + except Exception as _ce_load_err: + _ra().logger.debug("Context engine load from plugins/context_engine/: %s", _ce_load_err) + + # Try general plugin system as fallback + if _selected_engine is None: + try: + from hermes_cli.plugins import get_plugin_context_engine + _candidate = get_plugin_context_engine() + if _candidate and _candidate.name == _engine_name: + _selected_engine = _candidate + except Exception: + pass + + if _selected_engine is None: + _ra().logger.warning( + "Context engine '%s' not found — falling back to built-in compressor", + _engine_name, + ) + # else: config says "compressor" — use built-in, don't auto-activate plugins + + if _selected_engine is not None: + agent.context_compressor = _selected_engine + # Resolve context_length for plugin engines — mirrors switch_model() path + from agent.model_metadata import get_model_context_length + _plugin_ctx_len = get_model_context_length( + agent.model, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + config_context_length=_config_context_length, + provider=agent.provider, + custom_providers=_custom_providers, + ) + agent.context_compressor.update_model( + model=agent.model, + context_length=_plugin_ctx_len, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + ) + if not agent.quiet_mode: + _ra().logger.info("Using context engine: %s", _selected_engine.name) + else: + agent.context_compressor = ContextCompressor( + model=agent.model, + 
threshold_percent=compression_threshold, + protect_first_n=compression_protect_first, + protect_last_n=compression_protect_last, + summary_target_ratio=compression_target_ratio, + summary_model_override=None, + quiet_mode=agent.quiet_mode, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + config_context_length=_config_context_length, + provider=agent.provider, + api_mode=agent.api_mode, + ) + agent.compression_enabled = compression_enabled + + # Reject models whose context window is below the minimum required + # for reliable tool-calling workflows (64K tokens). + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + _ctx = getattr(agent.context_compressor, "context_length", 0) + if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH: + raise ValueError( + f"Model {agent.model} has a context window of {_ctx:,} tokens, " + f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required " + f"by Hermes Agent. Choose a model with at least " + f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set " + f"model.context_length in config.yaml to override." + ) + + # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand). + # Skip names that are already present — the _ra().get_tool_definitions() + # quiet_mode cache returned a shared list pre-#17335, so a stray + # mutation here would poison subsequent agent inits in the same + # Gateway process and trip provider-side 'duplicate tool name' + # errors. Even with the cache fix, dedup is the right defense + # against plugin paths that may register the same schemas via + # ctx.register_tool(). Mirrors the memory tools dedup above. 
+ agent._context_engine_tool_names: set = set() + if hasattr(agent, "context_compressor") and agent.context_compressor and agent.tools is not None: + _existing_tool_names = { + t.get("function", {}).get("name") + for t in agent.tools + if isinstance(t, dict) + } + for _schema in agent.context_compressor.get_tool_schemas(): + _tname = _schema.get("name", "") + if _tname and _tname in _existing_tool_names: + continue # already registered via plugin/cache path + _wrapped = {"type": "function", "function": _schema} + agent.tools.append(_wrapped) + if _tname: + agent.valid_tool_names.add(_tname) + agent._context_engine_tool_names.add(_tname) + _existing_tool_names.add(_tname) + + # Notify context engine of session start + if hasattr(agent, "context_compressor") and agent.context_compressor: + try: + agent.context_compressor.on_session_start( + agent.session_id, + hermes_home=str(get_hermes_home()), + platform=agent.platform or "cli", + model=agent.model, + context_length=getattr(agent.context_compressor, "context_length", 0), + ) + except Exception as _ce_err: + _ra().logger.debug("Context engine on_session_start: %s", _ce_err) + + agent._subdirectory_hints = SubdirectoryHintTracker( + working_dir=os.getenv("TERMINAL_CWD") or None, + ) + agent._user_turn_count = 0 + + # Cumulative token usage for the session + agent.session_prompt_tokens = 0 + agent.session_completion_tokens = 0 + agent.session_total_tokens = 0 + agent.session_api_calls = 0 + agent.session_input_tokens = 0 + agent.session_output_tokens = 0 + agent.session_cache_read_tokens = 0 + agent.session_cache_write_tokens = 0 + agent.session_reasoning_tokens = 0 + agent.session_estimated_cost_usd = 0.0 + agent.session_cost_status = "unknown" + agent.session_cost_source = "none" + + # ── Ollama num_ctx injection ── + # Ollama defaults to 2048 context regardless of the model's capabilities. 
+ # When running against an Ollama server, detect the model's max context + # and pass num_ctx on every chat request so the full window is used. + # User override: set model.ollama_num_ctx in config.yaml to cap VRAM use. + # If model.context_length is set, it caps num_ctx so the user's VRAM + # budget is respected even when GGUF metadata advertises a larger window. + agent._ollama_num_ctx: int | None = None + _ollama_num_ctx_override = None + if isinstance(_model_cfg, dict): + _ollama_num_ctx_override = _model_cfg.get("ollama_num_ctx") + if _ollama_num_ctx_override is not None: + try: + agent._ollama_num_ctx = int(_ollama_num_ctx_override) + except (TypeError, ValueError): + _ra().logger.debug("Invalid ollama_num_ctx config value: %r", _ollama_num_ctx_override) + if agent._ollama_num_ctx is None and agent.base_url and is_local_endpoint(agent.base_url): + try: + _detected = query_ollama_num_ctx(agent.model, agent.base_url, api_key=agent.api_key or "") + if _detected and _detected > 0: + agent._ollama_num_ctx = _detected + except Exception as exc: + _ra().logger.debug("Ollama num_ctx detection failed: %s", exc) + # Cap auto-detected ollama_num_ctx to the user's explicit context_length. + # Without this, GGUF metadata can advertise 256K+ which Ollama honours + # by allocating that much VRAM — blowing up small GPUs even though the + # user explicitly set a smaller context_length in config.yaml. 
+ if ( + agent._ollama_num_ctx + and _config_context_length + and _ollama_num_ctx_override is None # don't override explicit ollama_num_ctx + and agent._ollama_num_ctx > _config_context_length + ): + _ra().logger.info( + "Ollama num_ctx capped: %d -> %d (model.context_length override)", + agent._ollama_num_ctx, _config_context_length, + ) + agent._ollama_num_ctx = _config_context_length + if agent._ollama_num_ctx and not agent.quiet_mode: + _ra().logger.info( + "Ollama num_ctx: will request %d tokens (model max from /api/show)", + agent._ollama_num_ctx, + ) + + if not agent.quiet_mode: + if compression_enabled: + print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {agent.context_compressor.threshold_tokens:,})") + else: + print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (auto-compression disabled)") + + # Check immediately so CLI users see the warning at startup. + # Gateway status_callback is not yet wired, so any warning is stored + # in _compression_warning and replayed in the first run_conversation(). + agent._compression_warning = None + agent._check_compression_model_feasibility() + + # Snapshot primary runtime for per-turn restoration. When fallback + # activates during a turn, the next turn restores these values so the + # preferred model gets a fresh attempt each time. Uses a single dict + # so new state fields are easy to add without N individual attributes. + _cc = agent.context_compressor + agent._primary_runtime = { + "model": agent.model, + "provider": agent.provider, + "base_url": agent.base_url, + "api_mode": agent.api_mode, + "api_key": getattr(agent, "api_key", ""), + "client_kwargs": dict(agent._client_kwargs), + "use_prompt_caching": agent._use_prompt_caching, + "use_native_cache_layout": agent._use_native_cache_layout, + # Context engine state that _try_activate_fallback() overwrites. 
+ # Use getattr for model/base_url/api_key/provider since plugin + # engines may not have these (they're ContextCompressor-specific). + "compressor_model": getattr(_cc, "model", agent.model), + "compressor_base_url": getattr(_cc, "base_url", agent.base_url), + "compressor_api_key": getattr(_cc, "api_key", ""), + "compressor_provider": getattr(_cc, "provider", agent.provider), + "compressor_context_length": _cc.context_length, + "compressor_threshold_tokens": _cc.threshold_tokens, + } + if agent.api_mode == "anthropic_messages": + agent._primary_runtime.update({ + "anthropic_api_key": agent._anthropic_api_key, + "anthropic_base_url": agent._anthropic_base_url, + "is_anthropic_oauth": agent._is_anthropic_oauth, + }) + + + +__all__ = ["init_agent"] diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py new file mode 100644 index 00000000000..b5c70392946 --- /dev/null +++ b/agent/agent_runtime_helpers.py @@ -0,0 +1,2134 @@ +"""Assorted AIAgent runtime helpers — moved out of run_agent.py for clarity. + +Each function takes the parent ``AIAgent`` as its first argument +(``agent``) except for the static helpers (``sanitize_tool_call_arguments``, +``drop_thinking_only_and_merge_users``) which are stateless. AIAgent +keeps thin forwarders for backward compatibility. 
def convert_to_trajectory_format(agent, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
    """
    Convert internal message format to trajectory format for saving.

    The output always begins with a ``system`` entry (function-calling
    prompt plus ``agent._format_tools_for_system_message()``) and a ``human``
    entry holding ``user_query``.  ``messages[0]`` is skipped; the remaining
    messages are mapped to ``gpt`` (assistant), ``tool`` (all tool results
    that directly follow an assistant tool-call turn, joined into one entry)
    and ``human`` (user) entries.  Messages with any other role, and tool
    messages that do not directly follow an assistant turn with tool calls,
    produce no output entry.

    NOTE(review): several string literals below read as if markup tokens
    were removed from them (e.g. "within XML tags", ``"" not in content``,
    ``tool_response = "\\n"``).  They are reproduced exactly as found —
    confirm against the upstream file before relying on the emitted format.

    Args:
        agent: Object providing ``_format_tools_for_system_message()``.
        messages (List[Dict]): Internal message history
        user_query (str): Original user query
        completed (bool): Whether the conversation completed successfully
            (not referenced in this function's body)

    Returns:
        List[Dict]: Messages in trajectory format (``{"from", "value"}`` dicts)
    """
    # Normalize multimodal tool results — trajectories are text-only, so
    # replace image-bearing tool messages with their text_summary to avoid
    # embedding ~1MB base64 blobs into every saved trajectory.
    messages = [_trajectory_normalize_msg(m) for m in messages]
    trajectory = []

    # Add system message with tool definitions
    system_msg = (
        "You are a function calling AI model. You are provided with function signatures within XML tags. "
        "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
        "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
        "into functions. After calling & executing the functions, you will be provided with function results within "
        " XML tags. Here are the available tools:\n"
        f"\n{agent._format_tools_for_system_message()}\n\n"
        "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
        "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
        "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
        "Each function call should be enclosed within XML tags.\n"
        "Example:\n\n{'name': ,'arguments': }\n"
    )

    trajectory.append({
        "from": "system",
        "value": system_msg
    })

    # Add the actual user prompt (from the dataset) as the first human message
    trajectory.append({
        "from": "human",
        "value": user_query
    })

    # Skip the first message (the user query) since we already added it above.
    # Prefill messages are injected at API-call time only (not in the messages
    # list), so no offset adjustment is needed here.
    i = 1

    while i < len(messages):
        msg = messages[i]

        if msg["role"] == "assistant":
            # Check if this message has tool calls
            if "tool_calls" in msg and msg["tool_calls"]:
                # Format assistant message with tool calls:
                # reasoning first, then visible content, then each call.
                content = ""

                # Prepend native reasoning (thinking tokens) if available
                if msg.get("reasoning") and msg["reasoning"].strip():
                    content = f"\n{msg['reasoning']}\n\n"

                if msg.get("content") and msg["content"].strip():
                    # Normalize scratchpad-style reasoning embedded in content
                    # (used when native thinking is disabled and model reasons via XML)
                    content += convert_scratchpad_to_think(msg["content"]) + "\n"

                # Add each tool call as a JSON object
                for tool_call in msg["tool_calls"]:
                    if not tool_call or not isinstance(tool_call, dict): continue
                    # Parse arguments - should always succeed since we validate during conversation
                    # but keep try-except as safety net
                    try:
                        arguments = json.loads(tool_call["function"]["arguments"]) if isinstance(tool_call["function"]["arguments"], str) else tool_call["function"]["arguments"]
                    except json.JSONDecodeError:
                        # This shouldn't happen since we validate and retry during conversation,
                        # but if it does, log warning and use empty dict
                        logging.warning(f"Unexpected invalid JSON in trajectory conversion: {tool_call['function']['arguments'][:100]}")
                        arguments = {}

                    tool_call_json = {
                        "name": tool_call["function"]["name"],
                        "arguments": arguments
                    }
                    content += f"\n{json.dumps(tool_call_json, ensure_ascii=False)}\n\n"

                # Ensure every gpt turn has a reasoning block (empty if no reasoning)
                # so the format is consistent for training data.
                # NOTE(review): as written, `"" not in content` is always
                # False, so this branch never runs — the tested literal
                # appears to have lost its tag text; confirm upstream.
                if "" not in content:
                    content = "\n\n" + content

                trajectory.append({
                    "from": "gpt",
                    "value": content.rstrip()
                })

                # Collect all subsequent tool responses
                tool_responses = []
                j = i + 1
                while j < len(messages) and messages[j]["role"] == "tool":
                    tool_msg = messages[j]
                    # Start the formatted tool response
                    tool_response = "\n"

                    # Try to parse tool content as JSON if it looks like JSON
                    tool_content = tool_msg["content"]
                    try:
                        if tool_content.strip().startswith(("{", "[")):
                            tool_content = json.loads(tool_content)
                    except (json.JSONDecodeError, AttributeError):
                        pass  # Keep as string if not valid JSON

                    # Tool names are matched positionally: the k-th tool
                    # result is labelled with the k-th entry of tool_calls.
                    tool_index = len(tool_responses)
                    tool_name = (
                        msg["tool_calls"][tool_index]["function"]["name"]
                        if tool_index < len(msg["tool_calls"])
                        else "unknown"
                    )
                    tool_response += json.dumps({
                        "tool_call_id": tool_msg.get("tool_call_id", ""),
                        "name": tool_name,
                        "content": tool_content
                    }, ensure_ascii=False)
                    tool_response += "\n"
                    tool_responses.append(tool_response)
                    j += 1

                # Add all tool responses as a single message
                if tool_responses:
                    trajectory.append({
                        "from": "tool",
                        "value": "\n".join(tool_responses)
                    })
                    i = j - 1  # Skip the tool messages we just processed

            else:
                # Regular assistant message without tool calls:
                # reasoning first, then visible content.
                content = ""

                # Prepend native reasoning (thinking tokens) if available
                if msg.get("reasoning") and msg["reasoning"].strip():
                    content = f"\n{msg['reasoning']}\n\n"

                # Normalize scratchpad-style reasoning embedded in content
                # (used when native thinking is disabled and model reasons via XML)
                raw_content = msg["content"] or ""
                content += convert_scratchpad_to_think(raw_content)

                # Ensure every gpt turn has a reasoning block (empty if no reasoning).
                # NOTE(review): same always-False test as in the tool-call branch.
                if "" not in content:
                    content = "\n\n" + content

                trajectory.append({
                    "from": "gpt",
                    "value": content.strip()
                })

        elif msg["role"] == "user":
            trajectory.append({
                "from": "human",
                "value": msg["content"]
            })

        i += 1

    return trajectory
def sanitize_tool_call_arguments(
    messages: list,
    *,
    logger=None,
    session_id: str = None,
) -> int:
    """Replace unparseable assistant tool-call arguments with ``"{}"``, in place.

    For every assistant message carrying a ``tool_calls`` list, each call's
    ``function.arguments`` is normalised:

    * ``None``, ``""`` or whitespace-only strings become ``"{}"`` (not counted);
    * non-string arguments are left alone;
    * strings that fail ``json.loads`` are replaced with ``"{}"``, a warning
      is logged, and the corruption marker is attached to the matching tool
      result — prepended to an existing one found in the run of tool
      messages right after the assistant turn, or inserted as a new tool
      message directly after the assistant turn.

    Args:
        messages: Conversation history; mutated in place.
        logger: Optional logger; defaults to this module's logger.
        session_id: Only used in the warning text.

    Returns:
        Number of corrupted argument strings repaired (0 when ``messages``
        is not a list).
    """
    log = logger or logging.getLogger(__name__)
    if not isinstance(messages, list):
        return 0

    marker = _ra().AIAgent._TOOL_CALL_ARGUMENTS_CORRUPTION_MARKER
    repaired_total = 0

    def _flag_tool_message(tool_msg: dict) -> None:
        # Put the marker in front of whatever the tool message already holds.
        current = tool_msg.get("content")
        if current is None or (isinstance(current, str) and not current):
            tool_msg["content"] = marker
        elif isinstance(current, str):
            if not current.startswith(marker):
                tool_msg["content"] = f"{marker}\n{current}"
        else:
            try:
                rendered = json.dumps(current)
            except TypeError:
                rendered = str(current)
            tool_msg["content"] = f"{marker}\n{rendered}"

    def _matching_tool_reply(first_index: int, call_id):
        # Walk the contiguous run of tool messages; stop at the first non-tool.
        for position in range(first_index, len(messages)):
            candidate = messages[position]
            if not isinstance(candidate, dict) or candidate.get("role") != "tool":
                return None
            if candidate.get("tool_call_id") == call_id:
                return candidate
        return None

    cursor = 0
    while cursor < len(messages):
        entry = messages[cursor]
        is_assistant = isinstance(entry, dict) and entry.get("role") == "assistant"
        calls = entry.get("tool_calls") if is_assistant else None
        if not isinstance(calls, list) or not calls:
            cursor += 1
            continue

        # New marker messages go right behind the assistant turn, in call order.
        next_slot = cursor + 1
        for call in calls:
            function = call.get("function") if isinstance(call, dict) else None
            if not isinstance(function, dict):
                continue

            raw = function.get("arguments")
            if raw is None or raw == "" or (isinstance(raw, str) and not raw.strip()):
                function["arguments"] = "{}"
                continue
            if not isinstance(raw, str):
                continue

            try:
                json.loads(raw)
            except json.JSONDecodeError:
                pass
            else:
                continue  # arguments parse fine — nothing to repair

            call_id = call.get("id")
            fn_name = function.get("name", "?")
            log.warning(
                "Corrupted tool_call arguments repaired before request "
                "(session=%s, message_index=%s, tool_call_id=%s, function=%s, preview=%r)",
                session_id or "-",
                cursor,
                call_id or "-",
                fn_name,
                raw[:80],
            )
            function["arguments"] = "{}"

            reply = _matching_tool_reply(cursor + 1, call_id)
            if reply is not None:
                _flag_tool_message(reply)
            else:
                messages.insert(
                    next_slot,
                    {
                        "role": "tool",
                        "name": fn_name if fn_name != "?" else "",
                        "tool_call_id": call_id,
                        "content": marker,
                    },
                )
                next_slot += 1

            repaired_total += 1

        cursor += 1

    return repaired_total
def repair_message_sequence(agent, messages: List[Dict]) -> int:
    """Repair role-alternation violations in ``messages`` in place.

    Two repairs are applied, in this order:

    1. ``tool`` messages whose ``tool_call_id`` is missing or is not among
       the ids announced by the most recent assistant message are dropped.
       The set of valid ids is replaced on every assistant message and
       cleared on every user message.
    2. Adjacent ``user`` messages whose contents are both plain strings are
       merged into the earlier one, joined by a blank line.  Non-string
       (e.g. multimodal list) content is left untouched.

    Assistant tool-call turns followed by a user message are deliberately
    kept as they are.  Non-dict entries pass through unchanged.

    Args:
        agent: Unused by this function.
        messages: Conversation history; rewritten in place only when at
            least one repair happened.

    Returns:
        Total number of repairs (dropped tool messages + merges).
    """
    if not messages:
        return 0

    # Stage 1: filter out tool results that no live assistant call announced.
    orphan_count = 0
    live_ids: set = set()
    survivors: List[Dict] = []
    for entry in messages:
        role = entry.get("role") if isinstance(entry, dict) else None
        if role == "tool":
            call_id = entry.get("tool_call_id")
            if not call_id or call_id not in live_ids:
                orphan_count += 1
                continue
        elif role == "assistant":
            live_ids = {
                call.get("id")
                for call in (entry.get("tool_calls") or [])
                if isinstance(call, dict) and call.get("id")
            }
        elif role == "user":
            # A user turn ends the tool-result run; later tool messages need
            # a fresh assistant tool call to be considered valid.
            live_ids = set()
        survivors.append(entry)

    # Stage 2: fold back-to-back plain-text user turns together.
    merge_count = 0
    result: List[Dict] = []
    for entry in survivors:
        tail = result[-1] if result else None
        both_users = (
            isinstance(entry, dict)
            and entry.get("role") == "user"
            and isinstance(tail, dict)
            and tail.get("role") == "user"
        )
        if both_users:
            earlier = tail.get("content", "")
            later = entry.get("content", "")
            if isinstance(earlier, str) and isinstance(later, str):
                if earlier and later:
                    tail["content"] = earlier + "\n\n" + later
                else:
                    tail["content"] = earlier or later
                merge_count += 1
                continue
        result.append(entry)

    total = orphan_count + merge_count
    if total > 0:
        # Slice-assign so every holder of this list sees the repaired history.
        messages[:] = result
    return total
def strip_think_blocks(agent, content: str) -> str:
    """Remove reasoning and tool-call markup from ``content``, keeping visible text.

    Reasoning tags handled (case-insensitive): ``think``, ``thinking``,
    ``reasoning``, ``thought`` and ``REASONING_SCRATCHPAD``.

    Passes, in order:

    1.  Closed reasoning pairs (open tag ... matching close tag) are removed.
    1b. Closed ``tool_call`` / ``tool_calls`` / ``tool_result`` /
        ``function_call`` / ``function_calls`` blocks are removed; the open
        tag may carry attributes.
    1c. Closed ``function`` blocks are removed only when the open tag sits
        at the start of the text or right after a newline or one of
        ``. ! ? :`` (optionally followed by spaces/tabs) AND carries a
        ``name=`` attribute, so prose that merely mentions the tag survives.
    2.  An unterminated reasoning open tag at the start of the text or at
        the start of a line removes everything from that tag to the end.
    3.  Leftover reasoning open/close tags are removed together with
        trailing whitespace.
    3b. Leftover tool-call *closing* tags are removed together with trailing
        whitespace.  Bare or unterminated tool-call openers are kept on
        purpose: a truncated streaming tail may still be useful.

    The closed-pair and orphan-tag patterns previously contained no tag
    names at all (bare ``.*?`` / ``\\s*``), which removed ordinary text and
    whitespace.  They are rebuilt here from the tag names listed in pass 2
    and pass 1b.
    NOTE(review): the ``function`` tag name in 1c/3b is reconstructed from
    the surrounding comments — confirm against the upstream pattern.

    Args:
        agent: Unused by this function.
        content: Raw assistant text; ``None``/empty yields ``""``.

    Returns:
        ``content`` with the markup described above removed.
    """
    if not content:
        return ""

    flags = re.DOTALL | re.IGNORECASE
    reasoning_tags = ("think", "thinking", "reasoning", "thought", "REASONING_SCRATCHPAD")
    reasoning_alt = "|".join(reasoning_tags)

    # 1. Closed tag pairs — one pass per tag so an open tag is only ever
    #    paired with its own closer.  Case-insensitive so mixed-case tags
    #    don't fall through to the unterminated-tag pass (which would take
    #    trailing content with them).
    for tag in reasoning_tags:
        content = re.sub(rf'<{tag}>.*?</{tag}>', '', content, flags=flags)

    # 1b. Tool-call XML blocks.  No boundary gating: a literal one of these
    #     tags in prose is vanishingly rare.
    for _tc_name in ("tool_call", "tool_calls", "tool_result",
                     "function_call", "function_calls"):
        content = re.sub(
            rf'<{_tc_name}\b[^>]*>.*?</{_tc_name}>',
            '',
            content,
            flags=flags,
        )

    # 1c. Standalone <function name="...">...</function> tool call.  Only
    #     strip at a block boundary (start of text, after a newline, or
    #     after sentence-ending punctuation) AND with a name= attribute.
    content = re.sub(
        r'(?:(?<=^)|(?<=[\n\r.!?:]))[ \t]*'
        r'<function\b[^>]*\bname\s*=[^>]*>'
        r'(?:(?:(?!</function>).)*)</function>',
        '',
        content,
        flags=flags,
    )

    # 2. Unterminated reasoning block — open tag at a block boundary with no
    #    matching close.  Strip from the tag to end of string.
    content = re.sub(
        rf'(?:^|\n)[ \t]*<(?:{reasoning_alt})\b[^>]*>.*$',
        '',
        content,
        flags=flags,
    )

    # 3. Stray orphan reasoning open/close tags that slipped through.
    content = re.sub(
        rf'</?(?:{reasoning_alt})>\s*',
        '',
        content,
        flags=re.IGNORECASE,
    )

    # 3b. Stray tool-call closers only (openers are intentionally kept).
    content = re.sub(
        r'</(?:tool_call|tool_calls|tool_result|function_call|function_calls|function)>\s*',
        '',
        content,
        flags=re.IGNORECASE,
    )
    return content
def recover_with_credential_pool(
    agent,
    *,
    status_code: Optional[int],
    has_retried_429: bool,
    classified_reason: Optional[FailoverReason] = None,
    error_context: Optional[Dict[str, Any]] = None,
) -> tuple[bool, bool]:
    """Try to recover from an API failure using the agent's credential pool.

    The failure kind is ``classified_reason`` when given; otherwise it is
    derived from ``status_code`` (402 billing, 429 rate limit, 401/403 auth).
    Using the classifier result lets providers that report these conditions
    under another HTTP status still be handled.

    * billing: rotate to the next pool entry immediately.
    * rate limit: the first hit only sets the retry flag (same credential is
      retried) unless ``error_context`` says the usage limit was reached;
      otherwise rotate.
    * auth: skip recovery for entitlement failures; else try refreshing the
      current entry, and rotate if the refresh yields nothing.

    Args:
        agent: Agent exposing ``_credential_pool``, ``_swap_credential`` and
            ``_is_entitlement_failure``.
        status_code: HTTP status of the failure, if known.
        has_retried_429: Whether a rate-limit retry was already spent.
        classified_reason: Structured failure reason, if available.
        error_context: Optional provider error details (``reason``,
            ``message``), also forwarded to the pool.

    Returns:
        ``(recovered, has_retried_429)`` — the second item is the updated
        retry flag.
    """
    pool = agent._credential_pool
    if pool is None:
        return False, has_retried_429

    reason = classified_reason
    if reason is None:
        reason = {
            402: FailoverReason.billing,
            429: FailoverReason.rate_limit,
            401: FailoverReason.auth,
            403: FailoverReason.auth,
        }.get(status_code)

    def _rotate(fallback_status: int, log_template: str) -> bool:
        # Mark the current entry exhausted, switch to the next one if any.
        code = status_code if status_code is not None else fallback_status
        replacement = pool.mark_exhausted_and_rotate(status_code=code, error_context=error_context)
        if replacement is None:
            return False
        _ra().logger.info(log_template, code, getattr(replacement, "id", "?"))
        agent._swap_credential(replacement)
        return True

    if reason == FailoverReason.billing:
        if _rotate(402, "Credential %s (billing) — rotated to pool entry %s"):
            return True, False
        return False, has_retried_429

    if reason == FailoverReason.rate_limit:
        hard_limit = False
        if error_context:
            ctx_reason = str(error_context.get("reason") or "").lower()
            ctx_message = str(error_context.get("message") or "").lower()
            hard_limit = (
                "usage_limit_reached" in ctx_reason
                or "usage limit has been reached" in ctx_message
            )
        if not (has_retried_429 or hard_limit):
            # First plain 429: spend the single same-credential retry.
            return False, True
        if _rotate(429, "Credential %s (rate limit) — rotated to pool entry %s"):
            return True, False
        return False, True

    if reason == FailoverReason.auth:
        if agent._is_entitlement_failure(error_context, status_code):
            _ra().logger.info(
                "Credential %s — entitlement-shaped 403 from %s; "
                "skipping pool refresh (account lacks subscription, "
                "not a transient auth failure).",
                status_code if status_code is not None else "auth",
                agent.provider or "provider",
            )
            return False, has_retried_429

        refreshed = pool.try_refresh_current()
        if refreshed is not None:
            _ra().logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
            agent._swap_credential(refreshed)
            return True, has_retried_429

        # Refresh produced nothing — move on to the next credential rather
        # than giving up.
        if _rotate(401, "Credential %s (auth refresh failed) — rotated to pool entry %s"):
            return True, False

    return False, has_retried_429
def try_recover_primary_transport(
    agent, api_error: Exception, *, retry_count: int, max_retries: int,
) -> bool:
    """Rebuild the primary client once after a transient transport failure.

    Intended for use after the normal retries are exhausted: the existing
    client is closed (releasing stale connections), the runtime fields are
    re-applied from ``agent._primary_runtime``, a fresh client is built, and
    the call sleeps ``min(3 + retry_count, 8)`` seconds so the caller can
    make one more attempt on the primary provider before falling back.

    Nothing is done (returns ``False``) when:

    * fallback is already active;
    * ``type(api_error).__name__`` is not in ``_TRANSIENT_TRANSPORT_ERRORS``;
    * the endpoint is OpenRouter or the provider is ``nous`` /
      ``nous-research`` — aggregators manage their own retry infrastructure,
      so one more rebuilt client won't help.

    Args:
        agent: The agent whose client should be rebuilt.
        api_error: The exception that ended the last attempt.
        retry_count: Retries already made; only affects the wait time.
        max_retries: Not referenced in this function's body.

    Returns:
        ``True`` when the client was rebuilt and the wait completed,
        ``False`` when recovery was skipped or failed.
    """
    if agent._fallback_activated:
        return False

    # Only for transient transport errors
    error_type = type(api_error).__name__
    if error_type not in _TRANSIENT_TRANSPORT_ERRORS:
        return False

    # Skip for aggregator providers — they manage their own retry infra
    if agent._is_openrouter_url():
        return False
    provider_lower = (agent.provider or "").strip().lower()
    if provider_lower in {"nous", "nous-research"}:
        return False

    try:
        # Close existing client to release stale connections.  Closing is
        # best-effort: a failure here must not block the rebuild, but it is
        # logged instead of being silently discarded.
        if getattr(agent, "client", None) is not None:
            try:
                agent._close_openai_client(
                    agent.client, reason="primary_recovery", shared=True,
                )
            except Exception as close_err:
                _ra().logger.debug(
                    "Ignoring error while closing stale client during "
                    "primary recovery: %s",
                    close_err,
                )

        # Rebuild from primary snapshot
        rt = agent._primary_runtime
        agent._client_kwargs = dict(rt["client_kwargs"])
        agent.model = rt["model"]
        agent.provider = rt["provider"]
        agent.base_url = rt["base_url"]
        agent.api_mode = rt["api_mode"]
        if hasattr(agent, "_transport_cache"):
            agent._transport_cache.clear()
        agent.api_key = rt["api_key"]

        if agent.api_mode == "anthropic_messages":
            from agent.anthropic_adapter import build_anthropic_client
            agent._anthropic_api_key = rt["anthropic_api_key"]
            agent._anthropic_base_url = rt["anthropic_base_url"]
            agent._anthropic_client = build_anthropic_client(
                rt["anthropic_api_key"], rt["anthropic_base_url"],
                timeout=get_provider_request_timeout(agent.provider, agent.model),
            )
            agent._is_anthropic_oauth = rt["is_anthropic_oauth"]
            agent.client = None
        else:
            agent.client = agent._create_openai_client(
                dict(rt["client_kwargs"]),
                reason="primary_recovery",
                shared=True,
            )

        wait_time = min(3 + retry_count, 8)
        agent._vprint(
            f"{agent.log_prefix}🔁 Transient {error_type} on {agent.provider} — "
            f"rebuilt client, waiting {wait_time}s before one last primary attempt.",
            force=True,
        )
        time.sleep(wait_time)
        return True
    except Exception as e:
        # Route through the shared run_agent logger like the other helpers
        # in this module, rather than the root logger.
        _ra().logger.warning("Primary transport recovery failed: %s", e)
        return False
def drop_thinking_only_and_merge_users(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Remove thinking-only assistant turns and fuse user turns left adjacent.

    Meant for the per-call wire copy of the conversation: the input list and
    its dicts are never mutated (merged user messages are shallow copies).
    Dropping the turn, rather than injecting placeholder text, avoids
    fabricating model output while still keeping user/assistant alternation
    intact for the provider.

    Merge rules for two adjacent user messages:

    * str + str   -> joined with a blank line (no separator if either is empty)
    * list + list -> concatenated block lists
    * list + str  -> the string is appended as a ``text`` block (if non-empty)
    * str + list  -> the string becomes a leading ``text`` block (if non-empty)
    * anything else -> not merged; both messages are kept

    Args:
        messages: Messages about to be sent to the provider.

    Returns:
        The same ``messages`` object when it is empty or nothing was
        dropped; otherwise a new list.
    """
    if not messages:
        return messages

    # Step 1: filter out assistant turns that contain only thinking.
    is_thinking_only = _ra().AIAgent._is_thinking_only_assistant
    survivors = [item for item in messages if not is_thinking_only(item)]
    removed = len(messages) - len(survivors)
    if removed == 0:
        return messages

    cannot_merge = object()

    def _join_contents(first, second):
        # Combine two user contents, or signal an unknown shape.
        if isinstance(first, str):
            if isinstance(second, str):
                glue = "\n\n" if first and second else ""
                return first + glue + second
            if isinstance(second, list):
                lead = [{"type": "text", "text": first}] if first else []
                return lead + list(second)
        elif isinstance(first, list):
            if isinstance(second, list):
                return list(first) + list(second)
            if isinstance(second, str):
                extra = [{"type": "text", "text": second}] if second else []
                return list(first) + extra
        return cannot_merge

    # Step 2: fuse user messages that became neighbours.
    result: List[Dict[str, Any]] = []
    fused = 0
    for item in survivors:
        last = result[-1] if result else None
        adjacent_users = (
            last is not None
            and last.get("role") == "user"
            and item.get("role") == "user"
        )
        if not adjacent_users:
            result.append(item)
            continue

        combined = _join_contents(last.get("content", ""), item.get("content", ""))
        if combined is cannot_merge:
            # Unknown content shape — keep both (breaks alternation, but is
            # safer than raising on this path).
            result.append(item)
            continue

        # Replace the tail with a copy so the caller's dicts stay untouched.
        replacement = dict(last)
        replacement["content"] = combined
        result[-1] = replacement
        fused += 1

    _ra().logger.debug(
        "Pre-call sanitizer: dropped %d thinking-only assistant turn(s), "
        "merged %d adjacent user message(s)",
        removed,
        fused,
    )
    return result
def restore_primary_runtime(agent) -> bool:
    """Switch the agent back to its primary runtime at the start of a turn.

    A single agent instance can serve many turns (long CLI sessions, cached
    gateway agents).  Restoring here keeps a fallback turn-scoped instead of
    pinning every later turn to the fallback provider.

    Behaviour:

    * No fallback active: only ``_fallback_index`` is reset to 0 (so an
      exhausted chain from an earlier failed activation does not block
      future fallback attempts); returns ``False``.
    * ``_rate_limited_until`` still in the future: nothing changes; returns
      ``False`` and the agent stays on the fallback.
    * Otherwise the fields saved in ``agent._primary_runtime`` are
      re-applied, the client is rebuilt, the context compressor is pointed
      back at the primary model, and the fallback flags are cleared.

    Args:
        agent: The agent to restore.

    Returns:
        ``True`` when the primary runtime was restored; ``False`` when
        nothing was restored or an exception occurred part-way (in which
        case a warning is logged and earlier assignments are not undone).
    """
    if not agent._fallback_activated:
        # Always rewind the chain index, even though no fallback is active:
        # a failed activation attempt can leave it past the end of the chain.
        agent._fallback_index = 0
        return False

    cooldown_deadline = getattr(agent, "_rate_limited_until", 0)
    if cooldown_deadline > time.monotonic():
        return False  # primary still in rate-limit cooldown, stay on fallback

    snapshot = agent._primary_runtime
    try:
        # Core runtime state (base_url goes through the agent's own setter).
        for field in ("model", "provider", "base_url", "api_mode"):
            setattr(agent, field, snapshot[field])
        if hasattr(agent, "_transport_cache"):
            agent._transport_cache.clear()
        agent.api_key = snapshot["api_key"]
        agent._client_kwargs = dict(snapshot["client_kwargs"])
        agent._use_prompt_caching = snapshot["use_prompt_caching"]
        # Snapshots without the layout flag default to the native layout
        # only for the direct Anthropic provider.
        native_default = (
            agent.api_mode == "anthropic_messages" and agent.provider == "anthropic"
        )
        agent._use_native_cache_layout = snapshot.get(
            "use_native_cache_layout", native_default
        )

        # Rebuild the client for the primary provider.
        if agent.api_mode != "anthropic_messages":
            agent.client = agent._create_openai_client(
                dict(snapshot["client_kwargs"]),
                reason="restore_primary",
                shared=True,
            )
        else:
            from agent.anthropic_adapter import build_anthropic_client
            agent._anthropic_api_key = snapshot["anthropic_api_key"]
            agent._anthropic_base_url = snapshot["anthropic_base_url"]
            agent._anthropic_client = build_anthropic_client(
                snapshot["anthropic_api_key"], snapshot["anthropic_base_url"],
                timeout=get_provider_request_timeout(agent.provider, agent.model),
            )
            agent._is_anthropic_oauth = snapshot["is_anthropic_oauth"]
            agent.client = None

        # Point the context engine back at the primary model.
        agent.context_compressor.update_model(
            model=snapshot["compressor_model"],
            context_length=snapshot["compressor_context_length"],
            base_url=snapshot["compressor_base_url"],
            api_key=snapshot["compressor_api_key"],
            provider=snapshot["compressor_provider"],
        )

        # Fresh fallback chain for the new turn.
        agent._fallback_activated = False
        agent._fallback_index = 0

        logging.info(
            "Primary runtime restored for new turn: %s (%s)",
            agent.model, agent.provider,
        )
        return True
    except Exception as e:
        logging.warning("Failed to restore primary runtime: %s", e)
        return False
def extract_reasoning(agent, assistant_message) -> Optional[str]:
    """
    Extract reasoning/thinking content from an assistant message.

    Structured sources are read first, in this order, skipping duplicates:

    1. ``message.reasoning``
    2. ``message.reasoning_content``
    3. ``message.reasoning_details`` — for each dict entry, the first truthy
       value among ``summary``, ``thinking``, ``content``, ``text``

    Only when none of those produced anything is ``message.content``
    inspected:

    4. list content: blocks with ``type == "thinking"`` contribute their
       ``thinking`` (or ``text``) value, stripped;
    5. string content: text enclosed in ``think``, ``thinking``,
       ``reasoning``, ``thought`` or ``REASONING_SCRATCHPAD`` tag pairs
       (case-insensitive, may span lines), stripped.

    The inline patterns previously had no tag delimiters (a bare
    ``(.*?)``), which turned every character of ordinary content into a
    "reasoning" fragment; they now require an explicit open/close tag pair.
    NOTE(review): the tag list mirrors the reasoning tags stripped elsewhere
    in this module — confirm it matches the upstream pattern list.

    Args:
        agent: Unused by this function.
        assistant_message: The assistant message object from the API response

    Returns:
        Reasoning fragments joined by blank lines, or None if no reasoning found
    """
    reasoning_parts = []

    def _add(text) -> None:
        # Keep first occurrence only; ignore empty values.
        if text and text not in reasoning_parts:
            reasoning_parts.append(text)

    # Direct reasoning field, then the alternative name used by some providers.
    _add(getattr(assistant_message, "reasoning", None))
    _add(getattr(assistant_message, "reasoning_content", None))

    # reasoning_details array: [{"type": "reasoning.summary", "summary": "...", ...}, ...]
    for detail in getattr(assistant_message, "reasoning_details", None) or ():
        if isinstance(detail, dict):
            _add(
                detail.get('summary')
                or detail.get('thinking')
                or detail.get('content')
                or detail.get('text')
            )

    # Some providers embed reasoning directly inside assistant content
    # instead of returning structured reasoning fields. Only fall back
    # to inline extraction when no structured reasoning was found.
    content = getattr(assistant_message, "content", None)
    if not reasoning_parts and isinstance(content, list):
        # Typed content blocks, e.g.
        #   [{"type": "thinking", "thinking": "..."}, {"type": "output", ...}]
        # Without this branch the thinking text would be dropped.
        for block in content:
            if isinstance(block, dict) and block.get("type") == "thinking":
                thinking_text = block.get("thinking") or block.get("text") or ""
                _add(thinking_text.strip())

    if not reasoning_parts and isinstance(content, str) and content:
        flags = re.DOTALL | re.IGNORECASE
        inline_tags = (
            "think",
            "thinking",
            "reasoning",
            "thought",
            "REASONING_SCRATCHPAD",
        )
        for tag in inline_tags:
            for block in re.findall(rf"<{tag}>(.*?)</{tag}>", content, flags=flags):
                _add(block.strip())

    # Combine all reasoning parts
    if reasoning_parts:
        return "\n\n".join(reasoning_parts)

    return None
def dump_api_request_debug(
    agent,
    api_kwargs: Dict[str, Any],
    *,
    reason: str,
    error: Optional[Exception] = None,
) -> Optional[Path]:
    """Persist a JSON snapshot of an inference request for post-mortem debugging.

    The snapshot is meant for provider-side 4xx failures, where retrying is
    pointless and the exact request body is what matters.

    Args:
        agent: The live agent. Read for ``client``, ``session_id``,
            ``base_url``, ``api_mode``, ``logs_dir``, ``log_prefix`` and
            ``verbose_logging``; ``_mask_api_key_for_logs`` and ``_vprint``
            are called on it.
        api_kwargs: Keyword arguments of the API call. They are deep-copied;
            the ``timeout`` entry and every ``None`` value are left out of
            the recorded body.
        reason: Free-form label stored in the dump.
        error: Optional exception whose details are attached under ``error``.

    Returns:
        Path of the written dump file, or ``None`` when anything in the
        process failed (the failure is only logged if
        ``agent.verbose_logging`` is set).
    """
    try:
        # Copy before filtering so the caller's kwargs are never touched.
        request_body = {
            key: value
            for key, value in copy.deepcopy(api_kwargs).items()
            if key != "timeout" and value is not None
        }

        try:
            raw_key = getattr(agent.client, "api_key", None)
        except Exception as e:
            raw_key = None
            _ra().logger.debug("Could not extract API key for debug dump: %s", e)

        endpoint = "/responses" if agent.api_mode == "codex_responses" else "/chat/completions"
        dump_payload: Dict[str, Any] = {
            "timestamp": datetime.now().isoformat(),
            "session_id": agent.session_id,
            "reason": reason,
            "request": {
                "method": "POST",
                "url": f"{agent.base_url.rstrip('/')}{endpoint}",
                "headers": {
                    # The key is never written raw; the agent's masker decides
                    # what ends up in the file.
                    "Authorization": f"Bearer {agent._mask_api_key_for_logs(raw_key)}",
                    "Content-Type": "application/json",
                },
                "body": request_body,
            },
        }

        if error is not None:
            error_info: Dict[str, Any] = {
                "type": type(error).__name__,
                "message": str(error),
            }
            # Copy whichever well-known attributes the exception carries.
            # A non-None ``type`` attribute replaces the class name above.
            probed = (
                (attr_name, getattr(error, attr_name, None))
                for attr_name in ("status_code", "request_id", "code", "param", "type")
            )
            error_info.update(
                {attr_name: attr_value for attr_name, attr_value in probed if attr_value is not None}
            )

            error_body = getattr(error, "body", None)
            if error_body is not None:
                error_info["body"] = error_body

            http_response = getattr(error, "response", None)
            if http_response is not None:
                try:
                    error_info["response_status"] = getattr(http_response, "status_code", None)
                    error_info["response_text"] = http_response.text
                except Exception as e:
                    _ra().logger.debug("Could not extract error response details: %s", e)

            dump_payload["error"] = error_info

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        dump_file = agent.logs_dir / f"request_dump_{agent.session_id}_{stamp}.json"
        # default=str keeps the dump from failing on non-JSON-native values.
        rendered = json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str)
        dump_file.write_text(rendered, encoding="utf-8")

        agent._vprint(f"{agent.log_prefix}🧾 Request debug dump written to: {dump_file}")

        if env_var_enabled("HERMES_DUMP_REQUEST_STDOUT"):
            print(rendered)

        return dump_file
    except Exception as dump_error:
        # Best effort only: a failed dump must never mask the original error.
        if agent.verbose_logging:
            logging.warning(f"Failed to dump API request debug payload: {dump_error}")
        return None
+ """ + eff_provider = (provider if provider is not None else agent.provider) or "" + eff_base_url = base_url if base_url is not None else (agent.base_url or "") + eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "") + eff_model = (model if model is not None else agent.model) or "" + + model_lower = eff_model.lower() + provider_lower = eff_provider.lower() + is_claude = "claude" in model_lower + is_openrouter = base_url_host_matches(eff_base_url, "openrouter.ai") + # Nous Portal proxies to OpenRouter behind the scenes — identical + # OpenAI-wire envelope cache_control semantics. Treat it as an + # OpenRouter-equivalent endpoint for caching layout purposes. + is_nous_portal = "nousresearch" in eff_base_url.lower() + is_anthropic_wire = eff_api_mode == "anthropic_messages" + is_native_anthropic = ( + is_anthropic_wire + and (eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com") + ) + + if is_native_anthropic: + return True, True + if (is_openrouter or is_nous_portal) and is_claude: + return True, False + # Nous Portal Qwen (e.g. qwen3.6-plus) takes the same envelope-layout + # cache_control path as Portal Claude. Portal proxies to OpenRouter + # and the upstream Qwen route accepts cache_control markers; without + # this branch the alibaba-family check below only matches + # provider=opencode/alibaba and Portal traffic falls through to + # (False, False), serving 0% cache hits and re-billing the full + # prompt on every turn. + if is_nous_portal and "qwen" in model_lower: + return True, False + if is_anthropic_wire and is_claude: + # Third-party Anthropic-compatible gateway. + return True, True + + # MiniMax on its Anthropic-compatible endpoint serves its own + # model family (MiniMax-M2.7, M2.5, M2.1, M2) with documented + # cache_control support (0.1× read pricing, 5-minute TTL). 
def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
    """Build the provider client that matches the agent's current runtime.

    Dispatch order: Copilot ACP, Gemini Cloud Code Assist, native Gemini
    (only when the base URL is a native Gemini URL), then the generic
    OpenAI-compatible client resolved through ``_ra().OpenAI``.

    Args:
        agent: The live agent; ``provider`` selects the branch, and
            ``_build_keepalive_http_client`` / ``_client_log_context`` are
            called on it.
        client_kwargs: Constructor kwargs. Treated as read-only: the function
            works on a shallow copy.
        reason: Short label included in the creation log line.
        shared: Whether the client is the shared one; only logged.

    Returns:
        The newly constructed client object.
    """
    from agent.auxiliary_client import _validate_base_url, _validate_proxy_env_urls

    # Work on a private copy. Callers hand in agent._client_kwargs (or shallow
    # copies of it); mutating that dict would leak into every later request.
    # #10933 was exactly that: an injected httpx.Client transport was torn down
    # after the first request and then reused, so each retry failed with
    # "Cannot send a request, as the client has been closed". The copy pins the
    # contract so later transport/keepalive work cannot bring the bug back.
    client_kwargs = dict(client_kwargs)
    _validate_proxy_env_urls()
    _validate_base_url(client_kwargs.get("base_url"))

    def _announce(message: str) -> None:
        # One place for the "client created" log line shared by all branches.
        _ra().logger.info(message, reason, shared, agent._client_log_context())

    provider = agent.provider
    base_url_text = str(client_kwargs.get("base_url", ""))

    if provider == "copilot-acp" or base_url_text.startswith("acp://copilot"):
        from agent.copilot_acp_client import CopilotACPClient

        acp_client = CopilotACPClient(**client_kwargs)
        _announce("Copilot ACP client created (%s, shared=%s) %s")
        return acp_client

    if provider == "google-gemini-cli" or base_url_text.startswith("cloudcode-pa://"):
        from agent.gemini_cloudcode_adapter import GeminiCloudCodeClient

        # The Gemini client rejects OpenAI-only kwargs, so pass a whitelist.
        accepted = {"api_key", "base_url", "default_headers", "project_id", "timeout"}
        cloudcode_kwargs = {
            name: value for name, value in client_kwargs.items() if name in accepted
        }
        cloudcode_client = GeminiCloudCodeClient(**cloudcode_kwargs)
        _announce("Gemini Cloud Code Assist client created (%s, shared=%s) %s")
        return cloudcode_client

    if provider == "gemini":
        from agent.gemini_native_adapter import GeminiNativeClient, is_native_gemini_base_url

        gemini_base = str(client_kwargs.get("base_url", "") or "")
        # A non-native base URL falls through to the generic client below.
        if is_native_gemini_base_url(gemini_base):
            accepted = {"api_key", "base_url", "default_headers", "timeout", "http_client"}
            native_kwargs = {
                name: value for name, value in client_kwargs.items() if name in accepted
            }
            if "http_client" not in native_kwargs:
                gemini_http = agent._build_keepalive_http_client(gemini_base)
                if gemini_http is not None:
                    native_kwargs["http_client"] = gemini_http
            native_client = GeminiNativeClient(**native_kwargs)
            _announce("Gemini native client created (%s, shared=%s) %s")
            return native_client

    # TCP keepalives let the kernel notice a dead provider connection instead
    # of leaving it parked in CLOSE-WAIT (#10324). Without them a peer that
    # drops mid-stream can leave a socket on which epoll_wait never fires, the
    # ``httpx`` read timeout may not trigger, and the agent hangs until it is
    # killed. Probing starts after 30s idle, repeats every 10s and gives up
    # after 3 attempts, so a dead peer is detected within about 60s.
    #
    # This is safe with respect to #10933 because the injection only touches
    # the local copy made above, never ``agent._client_kwargs``. Every
    # ``_create_openai_client`` call therefore gets its OWN ``httpx.Client``
    # whose lifetime follows the OpenAI client it is handed to: closing the
    # OpenAI client (rebuild, teardown, credential rotation) closes the paired
    # ``httpx.Client`` too, and the next call builds a fresh one. The invariant
    # is pinned by ``tests/run_agent/test_create_openai_client_reuse.py`` and
    # ``tests/run_agent/test_sequential_chats_live.py``.
    if "http_client" not in client_kwargs:
        keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", ""))
        if keepalive_http is not None:
            client_kwargs["http_client"] = keepalive_http

    # ``OpenAI`` is looked up on run_agent at call time so tests that patch
    # ``run_agent.OpenAI`` reach this code path.
    openai_client = _ra().OpenAI(**client_kwargs)
    _announce("OpenAI client created (%s, shared=%s) %s")
    return openai_client
+ agent._config_context_length = None + + # ── Swap core runtime fields ── + agent.model = new_model + agent.provider = new_provider + # Use new base_url when provided; only fall back to current when the + # new provider genuinely has no endpoint (e.g. native SDK providers). + # Without this guard the old provider's URL (e.g. Ollama's localhost + # address) would persist silently after switching to a cloud provider + # that returns an empty base_url string. + if base_url: + agent.base_url = base_url + agent.api_mode = api_mode + # Invalidate transport cache — new api_mode may need a different transport + if hasattr(agent, "_transport_cache"): + agent._transport_cache.clear() + if api_key: + agent.api_key = api_key + + # ── Build new client ── + if api_mode == "anthropic_messages": + from agent.anthropic_adapter import ( + build_anthropic_client, + resolve_anthropic_token, + _is_oauth_token, + ) + # Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic. + # Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own + # API key — falling back would send Anthropic credentials to third-party endpoints. 
+ _is_native_anthropic = new_provider == "anthropic" + effective_key = (api_key or agent.api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or agent.api_key or "") + agent.api_key = effective_key + agent._anthropic_api_key = effective_key + agent._anthropic_base_url = base_url or getattr(agent, "_anthropic_base_url", None) + agent._anthropic_client = build_anthropic_client( + effective_key, agent._anthropic_base_url, + timeout=get_provider_request_timeout(agent.provider, agent.model), + ) + agent._is_anthropic_oauth = _is_oauth_token(effective_key) if _is_native_anthropic else False + agent.client = None + agent._client_kwargs = {} + else: + effective_key = api_key or agent.api_key + effective_base = base_url or agent.base_url + agent._client_kwargs = { + "api_key": effective_key, + "base_url": effective_base, + } + _sm_timeout = get_provider_request_timeout(agent.provider, agent.model) + if _sm_timeout is not None: + agent._client_kwargs["timeout"] = _sm_timeout + agent.client = agent._create_openai_client( + dict(agent._client_kwargs), + reason="switch_model", + shared=True, + ) + + # ── Re-evaluate prompt caching ── + agent._use_prompt_caching, agent._use_native_cache_layout = ( + agent._anthropic_prompt_cache_policy( + provider=new_provider, + base_url=agent.base_url, + api_mode=api_mode, + model=new_model, + ) + ) + + # ── LM Studio: preload before probing context length ── + agent._ensure_lmstudio_runtime_loaded() + + # ── Update context compressor ── + if hasattr(agent, "context_compressor") and agent.context_compressor: + from agent.model_metadata import get_model_context_length + # Re-read custom_providers from live config so per-model + # context_length overrides are honored when switching to a + # custom provider mid-session (closes #15779). 
+ _sm_custom_providers = None + try: + from hermes_cli.config import load_config, get_compatible_custom_providers + _sm_cfg = load_config() + _sm_custom_providers = get_compatible_custom_providers(_sm_cfg) + except Exception: + _sm_custom_providers = None + new_context_length = get_model_context_length( + agent.model, + base_url=agent.base_url, + api_key=agent.api_key, + provider=agent.provider, + config_context_length=getattr(agent, "_config_context_length", None), + custom_providers=_sm_custom_providers, + ) + agent.context_compressor.update_model( + model=agent.model, + context_length=new_context_length, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + api_mode=agent.api_mode, + ) + + # ── Invalidate cached system prompt so it rebuilds next turn ── + agent._cached_system_prompt = None + + # ── Update _primary_runtime so the change persists across turns ── + _cc = agent.context_compressor if hasattr(agent, "context_compressor") and agent.context_compressor else None + agent._primary_runtime = { + "model": agent.model, + "provider": agent.provider, + "base_url": agent.base_url, + "api_mode": agent.api_mode, + "api_key": getattr(agent, "api_key", ""), + "client_kwargs": dict(agent._client_kwargs), + "use_prompt_caching": agent._use_prompt_caching, + "use_native_cache_layout": agent._use_native_cache_layout, + "compressor_model": getattr(_cc, "model", agent.model) if _cc else agent.model, + "compressor_base_url": getattr(_cc, "base_url", agent.base_url) if _cc else agent.base_url, + "compressor_api_key": getattr(_cc, "api_key", "") if _cc else "", + "compressor_provider": getattr(_cc, "provider", agent.provider) if _cc else agent.provider, + "compressor_context_length": _cc.context_length if _cc else 0, + "compressor_threshold_tokens": _cc.threshold_tokens if _cc else 0, + } + if api_mode == "anthropic_messages": + agent._primary_runtime.update({ + "anthropic_api_key": agent._anthropic_api_key, + "anthropic_base_url": 
def invoke_tool(agent, function_name: str, function_args: dict, effective_task_id: str,
                tool_call_id: Optional[str] = None, messages: list = None,
                pre_tool_block_checked: bool = False) -> str:
    """Run one tool call and hand back its result string, with no display output.

    Agent-level tools (``todo``, ``session_search``, ``memory``, tools owned
    by the external memory manager, ``clarify``, ``delegate_task``) are
    handled here in that order; everything else goes to the registry
    dispatcher ``handle_function_call``. This is the entry point of the
    concurrent execution path; the sequential path keeps its own inline
    invocation so its display handling stays unchanged.

    Args:
        agent: The live agent providing stores, callbacks and session info.
        function_name: Name of the tool to run.
        function_args: Parsed arguments of the tool call.
        effective_task_id: Task id forwarded to hooks and the dispatcher.
        tool_call_id: Id of the originating tool call, if any.
        messages: Accepted for interface compatibility; not read here.
        pre_tool_block_checked: True when the caller already ran the plugin
            block check, so it is not repeated.

    Returns:
        The tool's result string, or a JSON ``{"error": ...}`` payload when a
        plugin hook blocked the call.
    """
    # Give plugin hooks the chance to veto the call before anything runs.
    if not pre_tool_block_checked:
        veto: Optional[str] = None
        try:
            from hermes_cli.plugins import get_pre_tool_call_block_message
            veto = get_pre_tool_call_block_message(
                function_name, function_args, task_id=effective_task_id or "",
            )
        except Exception:
            # Hook lookup is best effort; a broken plugin layer must not
            # prevent the tool from running.
            pass
        if veto is not None:
            return json.dumps({"error": veto}, ensure_ascii=False)

    if function_name == "todo":
        from tools.todo_tool import todo_tool as _todo_tool
        return _todo_tool(
            todos=function_args.get("todos"),
            merge=function_args.get("merge", False),
            store=agent._todo_store,
        )

    if function_name == "session_search":
        recall_db = agent._get_session_db_for_recall()
        if not recall_db:
            from hermes_state import format_session_db_unavailable
            return json.dumps({"success": False, "error": format_session_db_unavailable()})
        from tools.session_search_tool import session_search as _session_search
        return _session_search(
            query=function_args.get("query", ""),
            role_filter=function_args.get("role_filter"),
            limit=function_args.get("limit", 3),
            db=recall_db,
            current_session_id=agent.session_id,
        )

    if function_name == "memory":
        memory_target = function_args.get("target", "memory")
        from tools.memory_tool import memory_tool as _memory_tool
        outcome = _memory_tool(
            action=function_args.get("action"),
            target=memory_target,
            content=function_args.get("content"),
            old_text=function_args.get("old_text"),
            store=agent._memory_store,
        )
        # Mirror built-in memory writes to the external memory provider.
        if agent._memory_manager and function_args.get("action") in {"add", "replace"}:
            try:
                agent._memory_manager.on_memory_write(
                    function_args.get("action", ""),
                    memory_target,
                    function_args.get("content", ""),
                    metadata=agent._build_memory_write_metadata(
                        task_id=effective_task_id,
                        tool_call_id=tool_call_id,
                    ),
                )
            except Exception:
                # The bridge is advisory; the built-in write already happened.
                pass
        return outcome

    # The external memory manager is consulted before clarify/delegate_task,
    # so a tool name it claims is routed to it first.
    if agent._memory_manager and agent._memory_manager.has_tool(function_name):
        return agent._memory_manager.handle_tool_call(function_name, function_args)

    if function_name == "clarify":
        from tools.clarify_tool import clarify_tool as _clarify_tool
        return _clarify_tool(
            question=function_args.get("question", ""),
            choices=function_args.get("choices"),
            callback=agent.clarify_callback,
        )

    if function_name == "delegate_task":
        return agent._dispatch_delegate_task(function_args)

    # Registry tools: the pre-call hook is skipped because the block check
    # was already performed above (or by the caller).
    return _ra().handle_function_call(
        function_name, function_args, effective_task_id,
        tool_call_id=tool_call_id,
        session_id=agent.session_id or "",
        enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
        skip_pre_tool_call_hook=True,
    )
str | None: + lc = s.lower() + for suffix in ("_tool", "-tool", "tool"): + if lc.endswith(suffix): + return s[: -len(suffix)].rstrip("_-") + return None + + # Cheap fast-paths first — these cover the common case. + lowered = tool_name.lower() + if lowered in agent.valid_tool_names: + return lowered + normalized = _norm(tool_name) + if normalized in agent.valid_tool_names: + return normalized + + # Build the full candidate set for class-like emissions. + cands: set[str] = {tool_name, lowered, normalized, _camel_snake(tool_name)} + # Strip trailing tool-suffix up to twice — TodoTool_tool needs it. + for _ in range(2): + extra: set[str] = set() + for c in cands: + stripped = _strip_tool_suffix(c) + if stripped: + extra.add(stripped) + extra.add(_norm(stripped)) + extra.add(_camel_snake(stripped)) + cands |= extra + + for c in cands: + if c and c in agent.valid_tool_names: + return c + + # Fuzzy match as last resort. + matches = get_close_matches(lowered, agent.valid_tool_names, n=1, cutoff=0.7) + if matches: + return matches[0] + + return None + + + +def sanitize_api_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Fix orphaned tool_call / tool_result pairs before every LLM call. + + Runs unconditionally — not gated on whether the context compressor + is present — so orphans from session loading or manual message + manipulation are always caught. 
+ """ + # --- Role allowlist: drop messages with roles the API won't accept --- + filtered = [] + for msg in messages: + role = msg.get("role") + if role not in _ra().AIAgent._VALID_API_ROLES: + _ra().logger.debug( + "Pre-call sanitizer: dropping message with invalid role %r", + role, + ) + continue + filtered.append(msg) + messages = filtered + + surviving_call_ids: set = set() + for msg in messages: + if msg.get("role") == "assistant": + for tc in msg.get("tool_calls") or []: + cid = _ra().AIAgent._get_tool_call_id_static(tc) + if cid: + surviving_call_ids.add(cid) + + result_call_ids: set = set() + for msg in messages: + if msg.get("role") == "tool": + cid = msg.get("tool_call_id") + if cid: + result_call_ids.add(cid) + + # 1. Drop tool results with no matching assistant call + orphaned_results = result_call_ids - surviving_call_ids + if orphaned_results: + messages = [ + m for m in messages + if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results) + ] + _ra().logger.debug( + "Pre-call sanitizer: removed %d orphaned tool result(s)", + len(orphaned_results), + ) + + # 2. 
def looks_like_codex_intermediate_ack(
    agent,
    user_message: str,
    assistant_content: str,
    messages: List[Dict[str, Any]],
) -> bool:
    """Tell whether an assistant reply is only a "will do" note, not a final answer.

    The reply counts as an intermediate acknowledgement when all of these hold:

    * no ``tool`` message exists in ``messages`` yet;
    * after stripping think blocks the reply is non-empty and at most
      1200 characters long;
    * it contains a future-tense promise ("I'll", "I will", "let me", ...);
    * it names an action (inspect, check, run, ...);
    * the user's request or the reply itself refers to the workspace
      (repo, directory, files, or anything containing a ``/``).

    Args:
        agent: Object providing ``_strip_think_blocks``.
        user_message: The user's request text; ``None`` is treated as empty.
        assistant_content: The assistant reply; ``None`` is treated as empty.
        messages: Conversation so far.

    Returns:
        True when the turn should continue instead of ending.
    """
    # Once any tool has produced output, the model is past the planning stage.
    for entry in messages:
        if isinstance(entry, dict) and entry.get("role") == "tool":
            return False

    reply = agent._strip_think_blocks(assistant_content or "").strip().lower()
    if not reply or len(reply) > 1200:
        return False

    promise = re.search(
        r"\b(i['’]ll|i will|let me|i can do that|i can help with that)\b", reply
    )
    if promise is None:
        return False

    action_markers = (
        "look into",
        "look at",
        "inspect",
        "scan",
        "check",
        "analyz",
        "review",
        "explore",
        "read",
        "open",
        "run",
        "test",
        "fix",
        "debug",
        "search",
        "find",
        "walkthrough",
        "report back",
        "summarize",
    )
    workspace_markers = (
        "directory",
        "current directory",
        "current dir",
        "cwd",
        "repo",
        "repository",
        "codebase",
        "project",
        "folder",
        "filesystem",
        "file tree",
        "files",
        "path",
    )

    def _mentions(text, markers) -> bool:
        # Plain substring search; markers such as "analyz" are deliberate stems.
        return any(marker in text for marker in markers)

    request = (user_message or "").strip().lower()

    if not _mentions(reply, action_markers):
        return False
    if _mentions(request, workspace_markers) or "~/" in request or "/" in request:
        return True
    return _mentions(reply, workspace_markers)
Space (not "") + # because DeepSeek V4 Pro rejects empty-string reasoning_content + # in thinking mode (refs #17341). + normalized_reasoning = source_msg.get("reasoning") + if ( + needs_thinking_pad + and source_msg.get("tool_calls") + and isinstance(normalized_reasoning, str) + and normalized_reasoning + ): + api_msg["reasoning_content"] = " " + return + + # 3. Healthy session: promote 'reasoning' field to 'reasoning_content' + # for providers that use the internal 'reasoning' key. + # This must happen before the unconditional empty-string fallback so + # genuine reasoning content is not overwritten (#15812 regression in + # PR #15478). + if isinstance(normalized_reasoning, str) and normalized_reasoning: + api_msg["reasoning_content"] = normalized_reasoning + return + + # 4. DeepSeek / Kimi thinking mode: all assistant messages need + # reasoning_content. Inject a single space to satisfy the provider's + # requirement when no explicit reasoning content is present. Covers + # both tool-call turns (already-poisoned history with no reasoning + # at all) and plain text turns. Space (not "") because DeepSeek V4 + # Pro tightened validation and rejects empty string with HTTP 400 + # ("The reasoning content in the thinking mode must be passed back + # to the API"). Refs #17341. + if needs_thinking_pad: + api_msg["reasoning_content"] = " " + return + + # 5. reasoning_content was present but not a string (e.g. None after + # context compaction). Don't pass null to the API. + api_msg.pop("reasoning_content", None) + + + +def cleanup_dead_connections(agent) -> bool: + """Detect and clean up dead TCP connections on the primary client. + + Inspects the httpx connection pool for sockets in unhealthy states + (CLOSE-WAIT, errors). If any are found, force-closes all sockets + and rebuilds the primary client from scratch. + + Returns True if dead connections were found and cleaned up. 
def extract_api_error_context(error: Exception) -> Dict[str, Any]:
    """Extract structured rate-limit details from a provider error.

    Sources are consulted in order, and an earlier source's value is never
    overwritten by a later one:

    1. ``error.body`` when it is a dict (optionally nested under ``"error"``).
    2. ``error.response.headers`` (``retry-after`` / ``x-ratelimit-reset``).
    3. The message text, scanned for a ``quotaResetDelay`` value or a
       "retry [after] N seconds" phrase.

    Args:
        error: The exception raised by the provider client.

    Returns:
        A dict containing any of:

        * ``"reason"`` -- stripped ``code`` / ``type`` / ``error`` string.
        * ``"message"`` -- stripped provider message, or the first 500
          characters of ``str(error)`` when the body carries none.
        * ``"reset_at"`` -- either the provider's raw reset value (passed
          through unchanged) or an epoch timestamp computed as
          ``time.time()`` plus the advertised delay in seconds.
    """
    context: Dict[str, Any] = {}
    # Tuple (not set) membership: a set lookup hashes the probe and would
    # raise TypeError for an unhashable payload value such as a list.
    absent = (None, "")

    body = getattr(error, "body", None)
    payload = None
    if isinstance(body, dict):
        nested = body.get("error")
        payload = nested if isinstance(nested, dict) else body
    if isinstance(payload, dict):
        reason = payload.get("code") or payload.get("type") or payload.get("error")
        if isinstance(reason, str) and reason.strip():
            context["reason"] = reason.strip()
        message = payload.get("message") or payload.get("error_description")
        if isinstance(message, str) and message.strip():
            context["message"] = message.strip()
        for key in ("resets_at", "reset_at"):
            value = payload.get(key)
            if value not in absent:
                context["reset_at"] = value
                break
        retry_after = payload.get("retry_after")
        if retry_after not in absent and "reset_at" not in context:
            try:
                context["reset_at"] = time.time() + float(retry_after)
            except (TypeError, ValueError):
                pass  # Non-numeric delay: fall through to the other sources.

    response = getattr(error, "response", None)
    headers = getattr(response, "headers", None)
    if headers:
        retry_after = headers.get("retry-after") or headers.get("Retry-After")
        if retry_after and "reset_at" not in context:
            try:
                context["reset_at"] = time.time() + float(retry_after)
            except (TypeError, ValueError):
                pass  # Header is not a plain number of seconds; ignore it.
        ratelimit_reset = headers.get("x-ratelimit-reset")
        if ratelimit_reset and "reset_at" not in context:
            context["reset_at"] = ratelimit_reset

    if "message" not in context:
        raw_message = str(error).strip()
        if raw_message:
            context["message"] = raw_message[:500]

    if "reset_at" not in context:
        message = context.get("message") or ""
        if isinstance(message, str):
            # Single backslashes: inside a raw string ``\\d`` would match a
            # literal backslash followed by "d", never a digit.
            delay_match = re.search(
                r"quotaResetDelay[:\s\"]+(\d+(?:\.\d+)?)(ms|s)",
                message,
                re.IGNORECASE,
            )
            if delay_match:
                value = float(delay_match.group(1))
                seconds = value / 1000.0 if delay_match.group(2).lower() == "ms" else value
                context["reset_at"] = time.time() + seconds
            else:
                sec_match = re.search(
                    r"retry\s+(?:after\s+)?(\d+(?:\.\d+)?)\s*(?:sec|secs|seconds|s\b)",
                    message,
                    re.IGNORECASE,
                )
                if sec_match:
                    context["reset_at"] = time.time() + float(sec_match.group(1))

    return context
text to the last tool result in this turn. + + Called at the end of a tool-call batch, before the next API call. + The steer is appended to the last ``role:"tool"`` message's content + with a clear marker so the model understands it came from the user + and NOT from the tool itself. Role alternation is preserved — + nothing new is inserted, we only modify existing content. + + Args: + messages: The running messages list. + num_tool_msgs: Number of tool results appended in this batch; + used to locate the tail slice safely. + """ + if num_tool_msgs <= 0 or not messages: + return + steer_text = agent._drain_pending_steer() + if not steer_text: + return + # Find the last tool-role message in the recent tail. Skipping + # non-tool messages defends against future code appending + # something else at the boundary. + target_idx = None + for j in range(len(messages) - 1, max(len(messages) - num_tool_msgs - 1, -1), -1): + msg = messages[j] + if isinstance(msg, dict) and msg.get("role") == "tool": + target_idx = j + break + if target_idx is None: + # No tool result in this batch (e.g. all skipped by interrupt); + # put the steer back so the caller's fallback path can deliver + # it as a normal next-turn user message. + _lock = getattr(agent, "_pending_steer_lock", None) + if _lock is not None: + with _lock: + if agent._pending_steer: + agent._pending_steer = agent._pending_steer + "\n" + steer_text + else: + agent._pending_steer = steer_text + else: + existing = getattr(agent, "_pending_steer", None) + agent._pending_steer = (existing + "\n" + steer_text) if existing else steer_text + return + marker = f"\n\nUser guidance: {steer_text}" + existing_content = messages[target_idx].get("content", "") + if not isinstance(existing_content, str): + # Anthropic multimodal content blocks — preserve them and append + # a text block at the end. 
def force_close_tcp_sockets(client: Any) -> int:
    """Shut down and close every raw socket reachable from ``client``'s pool.

    Walks ``client._client._transport._pool`` and, for each pooled
    connection whose underlying socket can be located, calls
    ``shutdown(SHUT_RDWR)`` followed by ``close()`` so the descriptor is
    released right away rather than lingering in CLOSE-WAIT after a
    provider drops the connection mid-stream.

    Args:
        client: An SDK client wrapping an httpx client; any missing link in
            the attribute chain makes this a no-op.

    Returns:
        The number of sockets that were force-closed.
    """
    import socket as _socket

    closed = 0
    try:
        # Descend client -> http client -> transport -> pool, bailing out as
        # soon as one level is missing.
        node = client
        for attr in ("_client", "_transport", "_pool"):
            node = getattr(node, attr, None)
            if node is None:
                return 0

        # The connection list lives under ``_connections`` or ``_pool``
        # depending on the library version.
        pooled = (
            getattr(node, "_connections", None)
            or getattr(node, "_pool", None)
            or []
        )
        for conn in tuple(pooled):
            stream = (
                getattr(conn, "_network_stream", None)
                or getattr(conn, "_stream", None)
            )
            if stream is None:
                continue

            raw = getattr(stream, "_sock", None)
            if raw is None:
                # Some stream wrappers keep the socket one level deeper.
                inner = getattr(stream, "stream", None)
                raw = None if inner is None else getattr(inner, "_sock", None)
            if raw is None:
                continue

            try:
                raw.shutdown(_socket.SHUT_RDWR)
            except OSError:
                pass  # Already disconnected; close() below still applies.
            try:
                raw.close()
            except OSError:
                pass
            closed += 1
    except Exception as exc:
        _ra().logger.debug("Force-close TCP sockets sweep error: %s", exc)
    return closed
+ +The fork inherits the parent's live runtime (provider, model, base_url, +credentials, cached system prompt) so it hits the same prefix cache and +uses the same auth. It runs with a tool whitelist limited to memory and +skill management tools; everything else is denied at runtime. + +See the ``hermes-agent-dev`` skill (``references/self-improvement-loop.md``) +for invariants and PR review criteria. +""" + +from __future__ import annotations + +import contextlib +import json +import logging +import os +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +# Review-prompt strings — used by ``spawn_background_review_thread`` to build +# the user-message that the forked review agent receives. AIAgent exposes +# them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat; +# the actual text lives here so future edits are one-place. +_MEMORY_REVIEW_PROMPT = ( + "Review the conversation above and consider saving to memory if appropriate.\n\n" + "Focus on:\n" + "1. Has the user revealed things about themselves — their persona, desires, " + "preferences, or personal details worth remembering?\n" + "2. Has the user expressed expectations about how you should behave, their work " + "style, or ways they want you to operate?\n\n" + "If something stands out, save it using the memory tool. " + "If nothing is worth saving, just say 'Nothing to save.' and stop." +) + +_SKILL_REVIEW_PROMPT = ( + "Review the conversation above and update the skill library. Be " + "ACTIVE — most sessions produce at least one skill update, even if " + "small. A pass that does nothing is a missed learning opportunity, " + "not a neutral outcome.\n\n" + "Target shape of the library: CLASS-LEVEL skills, each with a rich " + "SKILL.md and a `references/` directory for session-specific detail. " + "Not a long flat list of narrow one-session-one-skill entries. 
This " + "shapes HOW you update, not WHETHER you update.\n\n" + "Signals to look for (any one of these warrants action):\n" + " • User corrected your style, tone, format, legibility, or " + "verbosity. Frustration signals like 'stop doing X', 'this is too " + "verbose', 'don't format like this', 'why are you explaining', " + "'just give me the answer', 'you always do Y and I hate it', or an " + "explicit 'remember this' are FIRST-CLASS skill signals, not just " + "memory signals. Update the relevant skill(s) to embed the " + "preference so the next session starts already knowing.\n" + " • User corrected your workflow, approach, or sequence of steps. " + "Encode the correction as a pitfall or explicit step in the skill " + "that governs that class of task.\n" + " • Non-trivial technique, fix, workaround, debugging path, or " + "tool-usage pattern emerged that a future session would benefit " + "from. Capture it.\n" + " • A skill that got loaded or consulted this session turned out " + "to be wrong, missing a step, or outdated. Patch it NOW.\n\n" + "Preference order — prefer the earliest action that fits, but do " + "pick one when a signal above fired:\n" + " 1. UPDATE A CURRENTLY-LOADED SKILL. Look back through the " + "conversation for skills the user loaded via /skill-name or you " + "read via skill_view. If any of them covers the territory of the " + "new learning, PATCH that one first. It is the skill that was in " + "play, so it's the right one to extend.\n" + " 2. UPDATE AN EXISTING UMBRELLA (via skills_list + skill_view). " + "If no loaded skill fits but an existing class-level skill does, " + "patch it. Add a subsection, a pitfall, or broaden a trigger.\n" + " 3. ADD A SUPPORT FILE under an existing umbrella. 
Skills can be " + "packaged with three kinds of support files — use the right " + "directory per kind:\n" + " • `references/.md` — session-specific detail (error " + "transcripts, reproduction recipes, provider quirks) AND " + "condensed knowledge banks: quoted research, API docs, external " + "authoritative excerpts, or domain notes you found while working " + "on the problem. Write it concise and for the value of the task, " + "not as a full mirror of upstream docs.\n" + " • `templates/.` — starter files meant to be " + "copied and modified (boilerplate configs, scaffolding, a " + "known-good example the agent can `reproduce with modifications`).\n" + " • `scripts/.` — statically re-runnable actions " + "the skill can invoke directly (verification scripts, fixture " + "generators, deterministic probes, anything the agent should run " + "rather than hand-type each time).\n" + " Add support files via skill_manage action=write_file with " + "file_path starting 'references/', 'templates/', or 'scripts/'. " + "The umbrella's SKILL.md should gain a one-line pointer to any " + "new support file so future agents know it exists.\n" + " 4. CREATE A NEW CLASS-LEVEL UMBRELLA SKILL when no existing " + "skill covers the class. The name MUST be at the class level. " + "The name MUST NOT be a specific PR number, error string, feature " + "codename, library-alone name, or 'fix-X / debug-Y / audit-Z-today' " + "session artifact. If the proposed name only makes sense for " + "today's task, it's wrong — fall back to (1), (2), or (3).\n\n" + "User-preference embedding (important): when the user expressed a " + "style/format/workflow preference, the update belongs in the " + "SKILL.md body, not just in memory. Memory captures 'who the user " + "is and what the current situation and state of your operations " + "are'; skills capture 'how to do this class of task for this " + "user'. 
When they complain about how you handled a task, the " + "skill that governs that task needs to carry the lesson.\n\n" + "If you notice two existing skills that overlap, note it in your " + "reply — the background curator handles consolidation at scale.\n\n" + "Do NOT capture (these become persistent self-imposed constraints " + "that bite you later when the environment changes):\n" + " • Environment-dependent failures: missing binaries, fresh-install " + "errors, post-migration path mismatches, 'command not found', " + "unconfigured credentials, uninstalled packages. The user can fix " + "these — they are not durable rules.\n" + " • Negative claims about tools or features ('browser tools do not " + "work', 'X tool is broken', 'cannot use Y from execute_code'). These " + "harden into refusals the agent cites against itself for months " + "after the actual problem was fixed.\n" + " • Session-specific transient errors that resolved before the " + "conversation ended. If retrying worked, the lesson is the retry " + "pattern, not the original failure.\n" + " • One-off task narratives. A user asking 'summarize today's " + "market' or 'analyze this PR' is not a class of work that warrants " + "a skill.\n\n" + "If a tool failed because of setup state, capture the FIX (install " + "command, config step, env var to set) under an existing setup or " + "troubleshooting skill — never 'this tool does not work' as a " + "standalone constraint.\n\n" + "'Nothing to save.' is a real option but should NOT be the " + "default. If the session ran smoothly with no corrections and " + "produced no new technique, just say 'Nothing to save.' and stop. " + "Otherwise, act." +) + +_COMBINED_REVIEW_PROMPT = ( + "Review the conversation above and update two things:\n\n" + "**Memory**: who the user is. Did the user reveal persona, " + "desires, preferences, personal details, or expectations about " + "how you should behave? 
Save facts about the user and durable " + "preferences with the memory tool.\n\n" + "**Skills**: how to do this class of task. Be ACTIVE — most " + "sessions produce at least one skill update. A pass that does " + "nothing is a missed learning opportunity, not a neutral outcome.\n\n" + "Target shape of the skill library: CLASS-LEVEL skills with a rich " + "SKILL.md and a `references/` directory for session-specific detail. " + "Not a long flat list of narrow one-session-one-skill entries.\n\n" + "Signals that warrant a skill update (any one is enough):\n" + " • User corrected your style, tone, format, legibility, " + "verbosity, or approach. Frustration is a FIRST-CLASS skill " + "signal, not just a memory signal. 'stop doing X', 'don't format " + "like this', 'I hate when you Y' — embed the lesson in the skill " + "that governs that task so the next session starts fixed.\n" + " • Non-trivial technique, fix, workaround, or debugging path " + "emerged.\n" + " • A skill that was loaded or consulted turned out wrong, " + "missing, or outdated — patch it now.\n\n" + "Preference order for skills — pick the earliest that fits:\n" + " 1. UPDATE A CURRENTLY-LOADED SKILL. Check what skills were " + "loaded via /skill-name or skill_view in the conversation. If one " + "of them covers the learning, PATCH it first. It was in play; " + "it's the right place.\n" + " 2. UPDATE AN EXISTING UMBRELLA (skills_list + skill_view to " + "find the right one). Patch it.\n" + " 3. ADD A SUPPORT FILE under an existing umbrella via " + "skill_manage action=write_file. Three kinds: " + "`references/.md` for session-specific detail OR condensed " + "knowledge banks (quoted research, API docs excerpts, domain " + "notes) written concise and task-focused; `templates/.` " + "for starter files meant to be copied and modified; " + "`scripts/.` for statically re-runnable actions " + "(verification, fixture generators, probes). 
Add a one-line " + "pointer in SKILL.md so future agents find them.\n" + " 4. CREATE A NEW CLASS-LEVEL UMBRELLA when nothing exists. " + "Name at the class level — NOT a PR number, error string, " + "codename, library-alone name, or 'fix-X / debug-Y' session " + "artifact. If the name only fits today's task, fall back to (1), " + "(2), or (3).\n\n" + "User-preference embedding: when the user complains about how " + "you handled a task, update the skill that governs that task — " + "memory alone isn't enough. Memory says 'who the user is and " + "what the current situation and state of your operations are'; " + "skills say 'how to do this class of task for this user'. Both " + "should carry user-preference lessons when relevant.\n\n" + "If you notice overlapping existing skills, mention it — the " + "background curator handles consolidation.\n\n" + "Do NOT capture as skills (these become persistent self-imposed " + "constraints that bite you later when the environment changes):\n" + " • Environment-dependent failures: missing binaries, fresh-install " + "errors, post-migration path mismatches, 'command not found', " + "unconfigured credentials, uninstalled packages. The user can fix " + "these — they are not durable rules.\n" + " • Negative claims about tools or features ('browser tools do not " + "work', 'X tool is broken', 'cannot use Y from execute_code'). These " + "harden into refusals the agent cites against itself for months " + "after the actual problem was fixed.\n" + " • Session-specific transient errors that resolved before the " + "conversation ended. If retrying worked, the lesson is the retry " + "pattern, not the original failure.\n" + " • One-off task narratives. 
def summarize_background_review_actions(
    review_messages: List[Dict],
    prior_snapshot: List[Dict],
) -> List[str]:
    """Build the human-facing action summary for a background review pass.

    Scans the review agent's ``role == "tool"`` messages, parses each one's
    content as JSON and, for results flagged ``success``, derives a short
    description from the result's ``message`` / ``target`` fields.

    Tool messages that already appear in ``prior_snapshot`` are ignored so
    results inherited from the earlier conversation are not reported again.
    Matching is by ``tool_call_id`` when the message has one, otherwise by
    exact string equality of ``content``.

    Args:
        review_messages: Messages produced by the review agent (may be
            ``None`` or empty).
        prior_snapshot: The conversation snapshot the review agent started
            from (may be ``None`` or empty).

    Returns:
        Action descriptions in encounter order. "created"/"updated" results
        contribute their own message text; add/remove/replace results
        contribute ``"<label> updated"`` where the label is ``"Memory"``,
        ``"User profile"``, or the raw ``target`` value.
    """
    known_ids = set()
    known_contents = set()
    for prior in prior_snapshot or []:
        if not isinstance(prior, dict) or prior.get("role") != "tool":
            continue
        prior_id = prior.get("tool_call_id")
        if prior_id:
            known_ids.add(prior_id)
        else:
            prior_content = prior.get("content")
            if isinstance(prior_content, str):
                known_contents.add(prior_content)

    actions: List[str] = []
    for msg in review_messages or []:
        if not isinstance(msg, dict) or msg.get("role") != "tool":
            continue

        call_id = msg.get("tool_call_id")
        if call_id:
            if call_id in known_ids:
                continue
        else:
            raw_content = msg.get("content")
            if isinstance(raw_content, str) and raw_content in known_contents:
                continue

        try:
            data = json.loads(msg.get("content", "{}"))
        except (json.JSONDecodeError, TypeError):
            continue
        if not isinstance(data, dict) or not data.get("success"):
            continue

        message = data.get("message", "")
        if not isinstance(message, str):
            # e.g. JSON null: nothing to classify, and calling .lower() on a
            # non-string would raise.
            continue
        lowered = message.lower()
        target = data.get("target", "")

        if "created" in lowered or "updated" in lowered:
            actions.append(message)
        elif (
            "added" in lowered
            or (target and "add" in lowered)
            or "removed" in lowered
            or "replaced" in lowered
        ):
            if target == "memory":
                label = "Memory"
            elif target == "user":
                label = "User profile"
            else:
                label = target
            actions.append(f"{label} updated")
    return actions
mirrors.""" + metadata: Dict[str, Any] = { + "write_origin": write_origin or getattr(agent, "_memory_write_origin", "assistant_tool"), + "execution_context": ( + execution_context + or getattr(agent, "_memory_write_context", "foreground") + ), + "session_id": agent.session_id or "", + "parent_session_id": agent._parent_session_id or "", + "platform": agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"), + "tool_name": "memory", + } + if task_id: + metadata["task_id"] = task_id + if tool_call_id: + metadata["tool_call_id"] = tool_call_id + return {k: v for k, v in metadata.items() if v not in {None, ""}} + + +def _run_review_in_thread( + agent: Any, + messages_snapshot: List[Dict], + prompt: str, +) -> None: + """Worker function executed in the background-review daemon thread. + + Spawns a forked ``AIAgent`` inheriting the parent's runtime, runs the + review prompt, and surfaces a compact action summary back to the user + via ``agent._safe_print`` and ``agent.background_review_callback``. + """ + # Local import to avoid a hard circular dep at module load. + from run_agent import AIAgent + from tools.terminal_tool import set_approval_callback as _set_approval_callback + + # Install a non-interactive approval callback on this worker + # thread so any dangerous-command guard the review agent trips + # resolves to "deny" instead of falling back to input() -- which + # deadlocks against the parent's prompt_toolkit TUI (#15216). + # Same pattern as _subagent_auto_deny in tools/delegate_tool.py. 
+ def _bg_review_auto_deny(command, description, **kwargs): + logger.warning( + "Background review auto-denied dangerous command: %s (%s)", + command, description, + ) + return "deny" + try: + _set_approval_callback(_bg_review_auto_deny) + except Exception: + pass + + review_agent = None + review_messages: List[Dict] = [] + try: + with open(os.devnull, "w", encoding="utf-8") as _devnull, \ + contextlib.redirect_stdout(_devnull), \ + contextlib.redirect_stderr(_devnull): + # Inherit the parent agent's live runtime (provider, model, + # base_url, api_key, api_mode) so the fork uses the exact + # same credentials the main turn is using. Without this, + # AIAgent.__init__ re-runs auto-resolution from env vars, + # which fails for OAuth-only providers, session-scoped + # creds, or credential-pool setups where the resolver can't + # reconstruct auth from scratch -- producing the spurious + # "No LLM provider configured" warning at end of turn. + _parent_runtime = agent._current_main_runtime() + _parent_api_mode = _parent_runtime.get("api_mode") or None + # The review fork needs to call agent-loop tools (memory, + # skill_manage). Those tools require Hermes' own dispatch, + # which the codex_app_server runtime bypasses entirely + # (it runs the turn inside codex's subprocess). So when + # the parent is on codex_app_server, downgrade the review + # fork to codex_responses — same auth/credentials, but + # talks to the OpenAI Responses API directly so Hermes + # owns the loop and the agent-loop tools dispatch. + if _parent_api_mode == "codex_app_server": + _parent_api_mode = "codex_responses" + # skip_memory=True keeps the review fork from + # touching external memory plugins (honcho, mem0, + # supermemory, etc.). 
Without it, the fork's + # __init__ rebuilds its own _memory_manager from + # config, scoped to the parent's session_id, and + # run_conversation() then leaks the harness prompt + # into the user's real memory namespace via three + # ingestion sites: on_turn_start (cadence + turn + # message), prefetch_all (recall query), and + # sync_all (harness prompt + review output recorded + # as a (user, assistant) turn pair). Built-in + # MEMORY.md / USER.md state is re-bound from the + # parent below so memory(action="add") writes from + # the review still land on disk; the review just + # has zero side effects on external providers. + review_agent = AIAgent( + model=agent.model, + max_iterations=16, + quiet_mode=True, + platform=agent.platform, + provider=agent.provider, + api_mode=_parent_api_mode, + base_url=_parent_runtime.get("base_url") or None, + api_key=_parent_runtime.get("api_key") or None, + credential_pool=getattr(agent, "_credential_pool", None), + parent_session_id=agent.session_id, + skip_memory=True, + ) + review_agent._memory_write_origin = "background_review" + review_agent._memory_write_context = "background_review" + review_agent._memory_store = agent._memory_store + review_agent._memory_enabled = agent._memory_enabled + review_agent._user_profile_enabled = agent._user_profile_enabled + review_agent._memory_nudge_interval = 0 + review_agent._skill_nudge_interval = 0 + # Suppress all status/warning emits from the fork so the + # user only sees the final successful-action summary. + # Without this, mid-review "Iteration budget exhausted", + # rate-limit retries, compression warnings, and other + # lifecycle messages bubble up through _emit_status -> + # _vprint and leak past the stdout redirect (they go via + # _print_fn/status_callback, which bypass sys.stdout). 
+ review_agent.suppress_status_output = True + # Inherit the parent's cached system prompt verbatim so + # the review fork's outbound HTTP request hits the same + # Anthropic/OpenRouter prefix cache the parent warmed. + # Without this, the fork rebuilds the system prompt from + # scratch (fresh _hermes_now() timestamp, fresh + # session_id, narrower toolset → different skills_prompt) + # and the byte-exact prefix-cache key misses. See + # issue #25322 and PR #17276 for the full analysis + + # measured impact (~26% end-to-end cost reduction on + # Sonnet 4.5). + review_agent._cached_system_prompt = agent._cached_system_prompt + # Defensive: pin session_start + session_id to the + # parent's so any code path that re-renders parts of + # the system prompt (compression, plugin hooks) still + # produces byte-identical output. The cached-prompt + # assignment above already short-circuits the normal + # rebuild path, but these pins guarantee parity even + # if a future code path bypasses the cache. + review_agent.session_start = agent.session_start + review_agent.session_id = agent.session_id + + from model_tools import get_tool_definitions + from hermes_cli.plugins import ( + set_thread_tool_whitelist, + clear_thread_tool_whitelist, + ) + + review_whitelist = { + t["function"]["name"] + for t in get_tool_definitions( + enabled_toolsets=["memory", "skills"], + quiet_mode=True, + ) + } + set_thread_tool_whitelist( + review_whitelist, + deny_msg_fmt=( + "Background review denied non-whitelisted tool: " + "{tool_name}. Only memory/skill tools are allowed." + ), + ) + try: + review_agent.run_conversation( + user_message=( + prompt + + "\n\nYou can only call memory and skill " + "management tools. Other tools will be denied " + "at runtime — do not attempt them." 
+ ), + conversation_history=messages_snapshot, + ) + finally: + clear_thread_tool_whitelist() + + # Tear down memory providers while stdout is still + # redirected so background thread teardown (Honcho flush, + # Hindsight sync, etc.) stays silent. The finally block + # below is a safety net for the exception path. + try: + review_agent.shutdown_memory_provider() + except Exception: + pass + try: + review_agent.close() + except Exception: + pass + review_messages = list(getattr(review_agent, "_session_messages", [])) + review_agent = None + + # Scan the review agent's messages for successful tool actions + # and surface a compact summary to the user. Tool messages + # already present in messages_snapshot must be skipped, since + # the review agent inherits that history and would otherwise + # re-surface stale "created"/"updated" messages from the prior + # conversation as if they just happened (issue #14944). + actions = summarize_background_review_actions( + review_messages, + messages_snapshot, + ) + + if actions: + summary = " · ".join(dict.fromkeys(actions)) + agent._safe_print( + f" 💾 Self-improvement review: {summary}" + ) + _bg_cb = agent.background_review_callback + if _bg_cb: + try: + _bg_cb( + f"💾 Self-improvement review: {summary}" + ) + except Exception: + pass + + except Exception as e: + logger.warning("Background memory/skill review failed: %s", e) + agent._emit_auxiliary_failure("background review", e) + finally: + # Safety-net cleanup for the exception path. Normal + # completion already shut down inside redirect_stdout above. + # Re-open devnull here so any teardown output (Honcho flush, + # Hindsight sync, background thread joins) stays silent even + # on the exception path where redirect_stdout already exited. 
def spawn_background_review_thread(
    agent: Any,
    messages_snapshot: List[Dict],
    review_memory: bool = False,
    review_skills: bool = False,
):
    """Prepare the callable and prompt for one background review run.

    The function does not start a thread itself.  It hands back a
    ``(target, prompt)`` pair and leaves ``threading.Thread`` construction
    to the caller (``AIAgent._spawn_background_review``), which keeps
    test-level patches of ``run_agent.threading.Thread`` effective.

    Args:
        agent: The parent agent; may carry per-instance prompt overrides.
        messages_snapshot: Conversation history handed to the review run.
        review_memory: True when the memory-review trigger fired.
        review_skills: True when the skill-review trigger fired.

    Returns:
        A tuple of the zero-argument thread target and the selected prompt.
    """
    # Choose which prompt applies.  Each module-level constant can be
    # shadowed by an attribute of the same name on the agent, so older code
    # that assigns e.g. agent._MEMORY_REVIEW_PROMPT directly still works.
    if not review_memory:
        # Skills-only, and also the fallback when neither trigger fired.
        attr_name, fallback = "_SKILL_REVIEW_PROMPT", _SKILL_REVIEW_PROMPT
    elif review_skills:
        attr_name, fallback = "_COMBINED_REVIEW_PROMPT", _COMBINED_REVIEW_PROMPT
    else:
        attr_name, fallback = "_MEMORY_REVIEW_PROMPT", _MEMORY_REVIEW_PROMPT
    prompt = getattr(agent, attr_name, fallback)

    def _target() -> None:
        _run_review_in_thread(agent, messages_snapshot, prompt)

    return _target, prompt
def interruptible_api_call(agent, api_kwargs: dict):
    """
    Run the API call in a background thread so the main conversation loop
    can detect interrupts without waiting for the full HTTP round-trip.

    Each worker thread gets its own OpenAI client instance. Interrupts only
    close that worker-local client, so retries and other requests never
    inherit a closed transport.

    Includes a stale-call detector: if no response arrives within the
    configured timeout, the connection is killed and an error raised so
    the main retry loop can try again with backoff / credential rotation /
    provider fallback.

    Args:
        agent: The parent agent. ``agent.api_mode`` selects the backend
            (``codex_responses``, ``anthropic_messages``,
            ``bedrock_converse``, or the chat-completions default).
        api_kwargs: Request keyword arguments for the selected backend.
            On the Bedrock path the ``__bedrock_region__`` and
            ``__bedrock_converse__`` keys are popped from this dict
            before it is passed to ``converse``.

    Returns:
        Whatever the selected backend call stored as the response.

    Raises:
        InterruptedError: ``agent._interrupt_requested`` was set while the
            worker thread was still running.
        TimeoutError: The stale threshold elapsed and the worker recorded
            neither a response nor an error.
        Exception: Any exception captured inside the worker is re-raised
            on the calling thread.
    """
    # Shared slots written by the worker thread and read by the poll loop.
    result = {"response": None, "error": None}
    request_client_holder = {"client": None}

    def _call():
        # Worker body: perform exactly one backend call and record either
        # the response or the exception; never let an exception escape.
        try:
            if agent.api_mode == "codex_responses":
                request_client_holder["client"] = agent._create_request_openai_client(
                    reason="codex_stream_request",
                    api_kwargs=api_kwargs,
                )
                result["response"] = agent._run_codex_stream(
                    api_kwargs,
                    client=request_client_holder["client"],
                    on_first_delta=getattr(agent, "_codex_on_first_delta", None),
                )
            elif agent.api_mode == "anthropic_messages":
                result["response"] = agent._anthropic_messages_create(api_kwargs)
            elif agent.api_mode == "bedrock_converse":
                # Bedrock uses boto3 directly — no OpenAI client needed.
                # normalize_converse_response produces an OpenAI-compatible
                # SimpleNamespace so the rest of the agent loop can treat
                # bedrock responses like chat_completions responses.
                from agent.bedrock_adapter import (
                    _get_bedrock_runtime_client,
                    invalidate_runtime_client,
                    is_stale_connection_error,
                    normalize_converse_response,
                )
                # Internal routing keys are removed so they are not passed
                # on to client.converse().
                region = api_kwargs.pop("__bedrock_region__", "us-east-1")
                api_kwargs.pop("__bedrock_converse__", None)
                client = _get_bedrock_runtime_client(region)
                try:
                    raw_response = client.converse(**api_kwargs)
                except Exception as _bedrock_exc:
                    # Evict the cached client on stale-connection failures
                    # so the outer retry loop builds a fresh client/pool.
                    if is_stale_connection_error(_bedrock_exc):
                        invalidate_runtime_client(region)
                    raise
                result["response"] = normalize_converse_response(raw_response)
            else:
                request_client_holder["client"] = agent._create_request_openai_client(
                    reason="chat_completion_request",
                    api_kwargs=api_kwargs,
                )
                result["response"] = request_client_holder["client"].chat.completions.create(**api_kwargs)
        except Exception as e:
            result["error"] = e
        finally:
            # Always release the worker-local client, on success or failure.
            request_client = request_client_holder.get("client")
            if request_client is not None:
                agent._close_request_openai_client(request_client, reason="request_complete")

    # ── Stale-call timeout (mirrors streaming stale detector) ────────
    # Non-streaming calls return nothing until the full response is
    # ready.  Without this, a hung provider can block for the full
    # httpx timeout (default 1800s) with zero feedback.  The stale
    # detector kills the connection early so the main retry loop can
    # apply richer recovery (credential rotation, provider fallback).
    _stale_timeout = agent._compute_non_stream_stale_timeout(
        api_kwargs.get("messages", [])
    )

    _call_start = time.time()
    agent._touch_activity("waiting for non-streaming API response")

    # Daemon thread: a hung request must not keep the process alive.
    t = threading.Thread(target=_call, daemon=True)
    t.start()
    _poll_count = 0
    while t.is_alive():
        # Short join so stale and interrupt checks run about every 0.3s.
        t.join(timeout=0.3)
        _poll_count += 1

        # Touch activity every ~30s so the gateway's inactivity
        # monitor knows we're alive while waiting for the response.
        if _poll_count % 100 == 0:  # 100 × 0.3s = 30s
            _elapsed = time.time() - _call_start
            agent._touch_activity(
                f"waiting for non-streaming response ({int(_elapsed)}s elapsed)"
            )

        # Stale-call detector: kill the connection if no response
        # arrives within the configured timeout.
        _elapsed = time.time() - _call_start
        if _elapsed > _stale_timeout:
            # Rough size estimate for the log line: characters // 4.
            _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
            logger.warning(
                "Non-streaming API call stale for %.0fs (threshold %.0fs). "
                "model=%s context=~%s tokens. Killing connection.",
                _elapsed, _stale_timeout,
                api_kwargs.get("model", "unknown"), f"{_est_ctx:,}",
            )
            agent._emit_status(
                f"⚠️ No response from provider for {int(_elapsed)}s "
                f"(non-streaming, model: {api_kwargs.get('model', 'unknown')}). "
                f"Aborting call."
            )
            # Best-effort teardown of the in-flight connection; a failure
            # here must not mask the timeout reported below.
            try:
                if agent.api_mode == "anthropic_messages":
                    agent._anthropic_client.close()
                    agent._rebuild_anthropic_client()
                else:
                    rc = request_client_holder.get("client")
                    if rc is not None:
                        agent._close_request_openai_client(rc, reason="stale_call_kill")
            except Exception:
                pass
            agent._touch_activity(
                f"stale non-streaming call killed after {int(_elapsed)}s"
            )
            # Wait briefly for the thread to notice the closed connection.
            t.join(timeout=2.0)
            # Only synthesize a timeout when the worker recorded nothing;
            # an error or response it did record takes precedence.
            if result["error"] is None and result["response"] is None:
                result["error"] = TimeoutError(
                    f"Non-streaming API call timed out after {int(_elapsed)}s "
                    f"with no response (threshold: {int(_stale_timeout)}s)"
                )
            break

        if agent._interrupt_requested:
            # Force-close the in-flight worker-local HTTP connection to stop
            # token generation without poisoning the shared client used to
            # seed future retries.
            try:
                if agent.api_mode == "anthropic_messages":
                    agent._anthropic_client.close()
                    agent._rebuild_anthropic_client()
                else:
                    request_client = request_client_holder.get("client")
                    if request_client is not None:
                        agent._close_request_openai_client(request_client, reason="interrupt_abort")
            except Exception:
                pass
            raise InterruptedError("Agent interrupted during API call")
    # Surface the worker's failure (or the synthesized timeout) to the caller.
    if result["error"] is not None:
        raise result["error"]
    return result["response"]
+ if agent.api_mode == "bedrock_converse": + _bt = agent._get_transport() + region = getattr(agent, "_bedrock_region", None) or "us-east-1" + guardrail = getattr(agent, "_bedrock_guardrail_config", None) + return _bt.build_kwargs( + model=agent.model, + messages=api_messages, + tools=tools_for_api, + max_tokens=agent.max_tokens or 4096, + region=region, + guardrail_config=guardrail, + ) + + if agent.api_mode == "codex_responses": + _ct = agent._get_transport() + is_github_responses = ( + base_url_host_matches(agent.base_url, "models.github.ai") + or base_url_host_matches(agent.base_url, "api.githubcopilot.com") + ) + is_codex_backend = ( + agent.provider == "openai-codex" + or ( + agent._base_url_hostname == "chatgpt.com" + and "/backend-api/codex" in agent._base_url_lower + ) + ) + is_xai_responses = agent.provider in {"xai", "xai-oauth"} or agent._base_url_hostname == "api.x.ai" + _msgs_for_codex = agent._prepare_messages_for_non_vision_model(api_messages) + return _ct.build_kwargs( + model=agent.model, + messages=_msgs_for_codex, + tools=tools_for_api, + reasoning_config=agent.reasoning_config, + session_id=getattr(agent, "session_id", None), + max_tokens=agent.max_tokens, + request_overrides=agent.request_overrides, + is_github_responses=is_github_responses, + is_codex_backend=is_codex_backend, + is_xai_responses=is_xai_responses, + github_reasoning_extra=agent._github_models_reasoning_extra_body() if is_github_responses else None, + ) + + # ── chat_completions (default) ───────────────────────────────────── + _ct = agent._get_transport() + + # Provider detection flags + _is_qwen = agent._is_qwen_portal() + _is_or = agent._is_openrouter_url() + _is_gh = ( + base_url_host_matches(agent._base_url_lower, "models.github.ai") + or base_url_host_matches(agent._base_url_lower, "api.githubcopilot.com") + ) + _is_nous = "nousresearch" in agent._base_url_lower + _is_nvidia = "integrate.api.nvidia.com" in agent._base_url_lower + _is_kimi = ( + 
base_url_host_matches(agent.base_url, "api.kimi.com") + or base_url_host_matches(agent.base_url, "moonshot.ai") + or base_url_host_matches(agent.base_url, "moonshot.cn") + ) + _is_tokenhub = base_url_host_matches(agent._base_url_lower, "tokenhub.tencentmaas.com") + _is_lmstudio = (agent.provider or "").strip().lower() == "lmstudio" + + # Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE + # sentinel (temperature omitted entirely), a numeric override, or None. + try: + from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE + _ft = _fixed_temperature_for_model(agent.model, agent.base_url) + _omit_temp = _ft is OMIT_TEMPERATURE + _fixed_temp = _ft if not _omit_temp else None + except Exception: + _omit_temp = False + _fixed_temp = None + + # Provider preferences (OpenRouter-style) + _prefs: Dict[str, Any] = {} + if agent.providers_allowed: + _prefs["only"] = agent.providers_allowed + if agent.providers_ignored: + _prefs["ignore"] = agent.providers_ignored + if agent.providers_order: + _prefs["order"] = agent.providers_order + if agent.provider_sort: + _prefs["sort"] = agent.provider_sort + if agent.provider_require_parameters: + _prefs["require_parameters"] = True + if agent.provider_data_collection: + _prefs["data_collection"] = agent.provider_data_collection + + # Claude max-output override on aggregators + _ant_max = None + if (_is_or or _is_nous) and "claude" in (agent.model or "").lower(): + try: + from agent.anthropic_adapter import _get_anthropic_max_output + _ant_max = _get_anthropic_max_output(agent.model) + except Exception: + pass + + # Qwen session metadata + _qwen_meta = None + if _is_qwen: + _qwen_meta = { + "sessionId": agent.session_id or "hermes", + "promptId": str(uuid.uuid4()), + } + + # ── Provider profile path (registered providers) ─────────────────── + # Profiles handle per-provider quirks via hooks. When a profile is + # found, delegate fully; otherwise fall through to the legacy flag path. 
+ try: + from providers import get_provider_profile + _profile = get_provider_profile(agent.provider) + except Exception: + _profile = None + + if _profile: + _ephemeral_out = getattr(agent, "_ephemeral_max_output_tokens", None) + if _ephemeral_out is not None: + agent._ephemeral_max_output_tokens = None + + # Strip image parts for non-vision models that have provider profiles + # (e.g. DeepSeek, Kimi). The legacy path below already does this, but + # registered providers with profiles were bypassing the strip. + api_messages = agent._prepare_messages_for_non_vision_model(api_messages) + + return _ct.build_kwargs( + model=agent.model, + messages=api_messages, + tools=tools_for_api, + base_url=agent.base_url, + timeout=agent._resolved_api_call_timeout(), + max_tokens=agent.max_tokens, + ephemeral_max_output_tokens=_ephemeral_out, + max_tokens_param_fn=agent._max_tokens_param, + reasoning_config=agent.reasoning_config, + request_overrides=agent.request_overrides, + session_id=getattr(agent, "session_id", None), + provider_profile=_profile, + ollama_num_ctx=agent._ollama_num_ctx, + # Context forwarded to profile hooks: + provider_preferences=_prefs or None, + openrouter_min_coding_score=agent.openrouter_min_coding_score, + anthropic_max_output=_ant_max, + supports_reasoning=agent._supports_reasoning_extra_body(), + qwen_session_metadata=_qwen_meta, + ) + + # ── Legacy flag path ──────────────────────────────────────────── + # Reached only when get_provider_profile() returns None — i.e. a + # completely unknown provider not in providers/ registry. + _ephemeral_out = getattr(agent, "_ephemeral_max_output_tokens", None) + if _ephemeral_out is not None: + agent._ephemeral_max_output_tokens = None + + # Strip image parts for non-vision models (no-op when vision-capable). 
def build_assistant_message(agent, assistant_message, finish_reason: str) -> dict:
    """Build a normalized assistant message dict from an API response message.

    Handles reasoning extraction, reasoning_details, and optional tool_calls
    so both the tool-call path and the final-response path share one builder.

    Args:
        agent: The parent agent; supplies reasoning extraction, think-block
            stripping, tool-call id helpers and the reasoning callback.
        assistant_message: The provider response message object.  It is read
            via attribute access (``content``, ``tool_calls``,
            ``reasoning_content``, ``reasoning_details``, ...).
        finish_reason: The finish reason stored verbatim on the result.

    Returns:
        A dict with ``role``, ``content``, ``reasoning`` and
        ``finish_reason``, plus ``reasoning_content``, ``reasoning_details``,
        ``codex_reasoning_items``, ``codex_message_items`` and ``tool_calls``
        when the corresponding data is present.
    """
    assistant_tool_calls = getattr(assistant_message, "tool_calls", None)
    reasoning_text = agent._extract_reasoning(assistant_message)

    # Fallback: extract inline <think> blocks from content when no structured
    # reasoning fields are present (some models/providers embed thinking
    # directly in the content rather than returning separate API fields).
    if not reasoning_text:
        content = assistant_message.content or ""
        # Only string content can be scanned; anything else is left to the
        # downstream handling and simply yields no inline reasoning here.
        if isinstance(content, str):
            # The pattern must name the tags explicitly: a bare ``(.*?)``
            # matches only empty strings and would never capture anything.
            think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
            if think_blocks:
                combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
                reasoning_text = combined or None

    if reasoning_text and agent.verbose_logging:
        logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")

    if reasoning_text and agent.reasoning_callback:
        # Skip callback when streaming is active — reasoning was already
        # displayed during the stream via one of two paths:
        #   (a) _fire_reasoning_delta (structured reasoning_content deltas)
        #   (b) _stream_delta tag extraction (inline tags such as <think>)
        # When streaming is NOT active, always fire so non-streaming modes
        # (gateway, batch, quiet) still get reasoning.
        # Any reasoning that wasn't shown during streaming is caught by the
        # CLI post-response display fallback (cli.py _reasoning_shown_this_turn).
        if not agent.stream_delta_callback and not agent._stream_callback:
            try:
                agent.reasoning_callback(reasoning_text)
            except Exception:
                # Display is best-effort; a callback failure must not stop
                # the message from being built.
                pass

    # Sanitize surrogates from API response — some models (e.g. Kimi/GLM via Ollama)
    # can return invalid surrogate code points that crash json.dumps() on persist.
    _raw_content = assistant_message.content or ""
    _san_content = _sanitize_surrogates(_raw_content)
    if reasoning_text:
        reasoning_text = _sanitize_surrogates(reasoning_text)

    # Strip inline reasoning tags (<think> etc.) from the stored
    # assistant content.  Reasoning was already captured into
    # ``reasoning_text`` above (either from structured fields or the
    # inline-block fallback), so the raw tags in content are redundant.
    # Leaving them in place caused reasoning to leak to messaging
    # platforms (#8878, #9568), inflate context on subsequent turns
    # (#9306 observed 16% content-size reduction on a real MiniMax
    # session), and pollute generated session titles.  One strip at the
    # storage boundary cleans content for every downstream consumer:
    # API replay, session transcript, gateway delivery, CLI display,
    # compression, title generation.
    if isinstance(_san_content, str) and _san_content:
        _san_content = agent._strip_think_blocks(_san_content).strip()

    msg = {
        "role": "assistant",
        "content": _san_content,
        "reasoning": reasoning_text,
        "finish_reason": finish_reason,
    }

    raw_reasoning_content = getattr(assistant_message, "reasoning_content", None)
    if raw_reasoning_content is None and hasattr(assistant_message, "model_extra"):
        model_extra = getattr(assistant_message, "model_extra", None) or {}
        if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
            raw_reasoning_content = model_extra["reasoning_content"]
    if raw_reasoning_content is not None:
        msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content)
    elif assistant_tool_calls and agent._needs_thinking_reasoning_pad():
        # DeepSeek v4 thinking mode and Kimi / Moonshot thinking mode
        # both require reasoning_content on every assistant tool-call
        # message.  Without it, replaying the persisted message causes
        # HTTP 400 ("The reasoning_content in the thinking mode must
        # be passed back to the API").  Include streamed reasoning
        # text when captured; otherwise pad with a single space —
        # DeepSeek V4 Pro tightened validation and rejects empty
        # string ("The reasoning content in the thinking mode must
        # be passed back to the API").  A space satisfies non-empty
        # checks everywhere without leaking fabricated reasoning.
        # Refs #15250, #17400, #17341.
        msg["reasoning_content"] = reasoning_text or " "

    # Additive fallback (refs #16844, #16884).  Streaming-only providers
    # (glm, MiniMax, gpt-5.x via aigw, Anthropic via openai-compat shims)
    # accumulate reasoning through ``delta.reasoning_content`` chunks
    # but never land it on the message object as a top-level attribute,
    # so neither branch above fires and the chain-of-thought is stored
    # only under the internal ``reasoning`` key.  When the user later
    # replays that history through a DeepSeek-v4 / Kimi thinking model,
    # the missing ``reasoning_content`` causes HTTP 400 ("The
    # reasoning_content in the thinking mode must be passed back to the
    # API.").
    #
    # Promote the already-sanitized streamed ``reasoning_text`` to
    # ``reasoning_content`` at write time, but ONLY when no prior branch
    # already set it AND we actually captured reasoning text.  This
    # preserves every existing behavior:
    #   - SDK-exposed ``reasoning_content`` (OpenAI/Moonshot/DeepSeek SDK)
    #     still wins.
    #   - DeepSeek tool-call single-space pad (#15250) still fires.
    #   - Non-thinking turns with no reasoning leave the field absent,
    #     so ``_copy_reasoning_content_for_api``'s cross-provider leak
    #     guard (#15748) and ``reasoning``→``reasoning_content``
    #     promotion tiers still apply at replay time.
    if "reasoning_content" not in msg and reasoning_text:
        msg["reasoning_content"] = reasoning_text

    if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
        # Pass reasoning_details back unmodified so providers (OpenRouter,
        # Anthropic, OpenAI) can maintain reasoning continuity across turns.
        # Each provider may include opaque fields (signature, encrypted_content)
        # that must be preserved exactly.
        raw_details = assistant_message.reasoning_details
        preserved = []
        for d in raw_details:
            if isinstance(d, dict):
                preserved.append(d)
            elif hasattr(d, "__dict__"):
                preserved.append(d.__dict__)
            elif hasattr(d, "model_dump"):
                preserved.append(d.model_dump())
        if preserved:
            msg["reasoning_details"] = preserved

    # Codex Responses API: preserve encrypted reasoning items for
    # multi-turn continuity.  These get replayed as input on the next turn.
    codex_items = getattr(assistant_message, "codex_reasoning_items", None)
    if codex_items:
        msg["codex_reasoning_items"] = codex_items

    # Codex Responses API: preserve exact assistant message items (with
    # id/phase) so follow-up turns can replay structured items instead of
    # flattening to plain text.  This is required for prefix cache hits.
    codex_message_items = getattr(assistant_message, "codex_message_items", None)
    if codex_message_items:
        msg["codex_message_items"] = codex_message_items

    if assistant_tool_calls:
        tool_calls = []
        for tool_call in assistant_tool_calls:
            # Resolve a usable call id, in priority order: explicit
            # ``call_id`` → id embedded in the raw tool id → the raw id
            # itself → a deterministic id derived from name/args/position.
            raw_id = getattr(tool_call, "id", None)
            call_id = getattr(tool_call, "call_id", None)
            if not isinstance(call_id, str) or not call_id.strip():
                embedded_call_id, _ = agent._split_responses_tool_id(raw_id)
                call_id = embedded_call_id
            if not isinstance(call_id, str) or not call_id.strip():
                if isinstance(raw_id, str) and raw_id.strip():
                    call_id = raw_id.strip()
                else:
                    _fn = getattr(tool_call, "function", None)
                    _fn_name = getattr(_fn, "name", "") if _fn else ""
                    _fn_args = getattr(_fn, "arguments", "{}") if _fn else "{}"
                    call_id = agent._deterministic_call_id(_fn_name, _fn_args, len(tool_calls))
            call_id = call_id.strip()

            response_item_id = getattr(tool_call, "response_item_id", None)
            if not isinstance(response_item_id, str) or not response_item_id.strip():
                _, embedded_response_item_id = agent._split_responses_tool_id(raw_id)
                response_item_id = embedded_response_item_id

            response_item_id = agent._derive_responses_function_call_id(
                call_id,
                response_item_id if isinstance(response_item_id, str) else None,
            )

            tc_dict = {
                "id": call_id,
                "call_id": call_id,
                "response_item_id": response_item_id,
                "type": tool_call.type,
                "function": {
                    "name": tool_call.function.name,
                    "arguments": tool_call.function.arguments
                },
            }
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
            extra = getattr(tool_call, "extra_content", None)
            if extra is not None:
                if hasattr(extra, "model_dump"):
                    extra = extra.model_dump()
                tc_dict["extra_content"] = extra
            tool_calls.append(tc_dict)
        msg["tool_calls"] = tool_calls

    return msg
+ fallback_already_active = bool(getattr(agent, "_fallback_activated", False)) + current_provider = (getattr(agent, "provider", "") or "").strip().lower() + primary_provider = ((agent._primary_runtime or {}).get("provider") or "").strip().lower() + if (not fallback_already_active) or (primary_provider and current_provider == primary_provider): + agent._rate_limited_until = time.monotonic() + 60 + if agent._fallback_index >= len(agent._fallback_chain): + return False + + fb = agent._fallback_chain[agent._fallback_index] + agent._fallback_index += 1 + fb_provider = (fb.get("provider") or "").strip().lower() + fb_model = (fb.get("model") or "").strip() + if not fb_provider or not fb_model: + return agent._try_activate_fallback() # skip invalid, try next + + # Skip entries that resolve to the current (provider, model) — falling + # back to the same backend that just failed loops the failure. Compare + # base_url too so two distinct custom_providers entries pointing at the + # same shim/proxy URL also dedup. See issue #22548. + current_provider = (getattr(agent, "provider", "") or "").strip().lower() + current_model = (getattr(agent, "model", "") or "").strip() + current_base_url = str(getattr(agent, "base_url", "") or "").rstrip("/").lower() + fb_base_url_for_dedup = (fb.get("base_url") or "").strip().rstrip("/").lower() + if fb_provider == current_provider and fb_model == current_model: + logging.warning( + "Fallback skip: chain entry %s/%s matches current provider/model", + fb_provider, fb_model, + ) + return agent._try_activate_fallback() + if ( + fb_base_url_for_dedup + and current_base_url + and fb_base_url_for_dedup == current_base_url + and fb_model == current_model + ): + logging.warning( + "Fallback skip: chain entry base_url %s matches current backend", + fb_base_url_for_dedup, + ) + return agent._try_activate_fallback() + + # Use centralized router for client construction. 
+ # raw_codex=True because the main agent needs direct responses.stream() + # access for Codex providers. + try: + from agent.auxiliary_client import resolve_provider_client + # Pass base_url and api_key from fallback config so custom + # endpoints (e.g. Ollama Cloud) resolve correctly instead of + # falling through to OpenRouter defaults. + fb_base_url_hint = (fb.get("base_url") or "").strip() or None + fb_api_key_hint = (fb.get("api_key") or "").strip() or None + if not fb_api_key_hint: + # key_env and api_key_env are both documented aliases (see + # _normalize_custom_provider_entry in hermes_cli/config.py). + fb_key_env = (fb.get("key_env") or fb.get("api_key_env") or "").strip() + if fb_key_env: + fb_api_key_hint = os.getenv(fb_key_env, "").strip() or None + # For Ollama Cloud endpoints, pull OLLAMA_API_KEY from env + # when no explicit key is in the fallback config. Host match + # (not substring) — see GHSA-76xc-57q6-vm5m. + if fb_base_url_hint and base_url_host_matches(fb_base_url_hint, "ollama.com") and not fb_api_key_hint: + fb_api_key_hint = os.getenv("OLLAMA_API_KEY") or None + fb_client, _resolved_fb_model = resolve_provider_client( + fb_provider, model=fb_model, raw_codex=True, + explicit_base_url=fb_base_url_hint, + explicit_api_key=fb_api_key_hint) + if fb_client is None: + logging.warning( + "Fallback to %s failed: provider not configured", + fb_provider) + return agent._try_activate_fallback() # try next in chain + try: + from hermes_cli.model_normalize import normalize_model_for_provider + + fb_model = normalize_model_for_provider(fb_model, fb_provider) + except Exception: + pass + + # Determine api_mode from provider / base URL / model + fb_api_mode = "chat_completions" + fb_base_url = str(fb_client.base_url) + _fb_is_azure = agent._is_azure_openai_url(fb_base_url) + if fb_provider == "openai-codex": + fb_api_mode = "codex_responses" + elif fb_provider == "anthropic" or fb_base_url.rstrip("/").lower().endswith("/anthropic"): + fb_api_mode = 
"anthropic_messages" + elif _fb_is_azure: + # Azure OpenAI serves gpt-5.x on /chat/completions — does NOT + # support the Responses API. Stay on chat_completions. + fb_api_mode = "chat_completions" + elif agent._is_direct_openai_url(fb_base_url): + fb_api_mode = "codex_responses" + elif agent._provider_model_requires_responses_api( + fb_model, + provider=fb_provider, + ): + # GPT-5.x models usually need Responses API, but keep + # provider-specific exceptions like Copilot gpt-5-mini on + # chat completions. + fb_api_mode = "codex_responses" + elif fb_provider == "bedrock" or ( + base_url_hostname(fb_base_url).startswith("bedrock-runtime.") + and base_url_host_matches(fb_base_url, "amazonaws.com") + ): + fb_api_mode = "bedrock_converse" + + old_model = agent.model + + # Clear the per-config context_length override so the fallback + # model's actual context window is resolved instead of inheriting + # the stale value from the previous model. See #22387. + agent._config_context_length = None + agent.model = fb_model + agent.provider = fb_provider + agent.base_url = fb_base_url + agent.api_mode = fb_api_mode + if hasattr(agent, "_transport_cache"): + agent._transport_cache.clear() + agent._fallback_activated = True + + # Honor per-provider / per-model request_timeout_seconds for the + # fallback target (same knob the primary client uses). None = use + # SDK default. 
+ _fb_timeout = get_provider_request_timeout(fb_provider, fb_model) + + if fb_api_mode == "anthropic_messages": + # Build native Anthropic client instead of using OpenAI client + from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token, _is_oauth_token + effective_key = (fb_client.api_key or resolve_anthropic_token() or "") if fb_provider == "anthropic" else (fb_client.api_key or "") + agent.api_key = effective_key + agent._anthropic_api_key = effective_key + agent._anthropic_base_url = fb_base_url + agent._anthropic_client = build_anthropic_client( + effective_key, agent._anthropic_base_url, timeout=_fb_timeout, + ) + agent._is_anthropic_oauth = _is_oauth_token(effective_key) if fb_provider == "anthropic" else False + agent.client = None + agent._client_kwargs = {} + else: + # Swap OpenAI client and config in-place + agent.api_key = fb_client.api_key + agent.client = fb_client + # Preserve provider-specific headers that + # resolve_provider_client() may have baked into + # fb_client via the default_headers kwarg. The OpenAI + # SDK stores these in _custom_headers. Without this, + # subsequent request-client rebuilds (via + # _create_request_openai_client) drop the headers, + # causing 403s from providers like Kimi Coding that + # require a User-Agent sentinel. + fb_headers = getattr(fb_client, "_custom_headers", None) + if not fb_headers: + fb_headers = getattr(fb_client, "default_headers", None) + agent._client_kwargs = { + "api_key": fb_client.api_key, + "base_url": fb_base_url, + **({"default_headers": dict(fb_headers)} if fb_headers else {}), + } + if _fb_timeout is not None: + agent._client_kwargs["timeout"] = _fb_timeout + # Rebuild the shared OpenAI client so the configured + # timeout takes effect on the very next fallback request, + # not only after a later credential-rotation rebuild. 
# Prompt appended to the conversation to ask the model for a tool-free wrap-up.
_SUMMARY_REQUEST_TEXT = (
    "You've reached the maximum number of tool-calling iterations allowed. "
    "Please provide a final response summarizing what you've found and accomplished so far, "
    "without calling any more tools."
)

# Returned when the provider answered but no visible summary text remained.
_SUMMARY_UNAVAILABLE_TEXT = "I reached the iteration limit and couldn't generate a summary."

# Internal-only message keys that strict APIs (e.g. Mistral) reject with 422.
_SUMMARY_INTERNAL_FIELDS = ("reasoning", "finish_reason", "_thinking_prefill")

# The think-tag names are assembled from fragments on purpose: the literal
# tags appear to have been lost from the earlier pattern (leaving a regex
# that erased all whitespace), most likely through markup-stripping tooling.
# Building them this way keeps the pattern intact if that happens again.
_SUMMARY_THINK_OPEN = "<" + "think" + ">"
_SUMMARY_THINK_CLOSE = "</" + "think" + ">"
_SUMMARY_THINK_BLOCK_RE = re.compile(
    re.escape(_SUMMARY_THINK_OPEN) + r".*?" + re.escape(_SUMMARY_THINK_CLOSE) + r"\s*",
    re.DOTALL,
)


def _summary_strip_reasoning(text: str) -> str:
    """Remove inline think-tag reasoning blocks from *text* and trim the result."""
    if _SUMMARY_THINK_OPEN not in text:
        return text
    return _SUMMARY_THINK_BLOCK_RE.sub("", text).strip()


def _summary_build_messages(agent, messages: list) -> list:
    """Return an API-ready copy of *messages* (system prompt, prefill, sanitizing applied)."""
    needs_sanitize = agent._should_sanitize_tool_calls()
    api_messages = []
    for msg in messages:
        api_msg = msg.copy()
        agent._copy_reasoning_content_for_api(msg, api_msg)
        for field in _SUMMARY_INTERNAL_FIELDS:
            api_msg.pop(field, None)
        if needs_sanitize:
            agent._sanitize_tool_calls_for_strict_api(api_msg)
        api_messages.append(api_msg)

    system_text = agent._cached_system_prompt or ""
    if agent.ephemeral_system_prompt:
        system_text = (system_text + "\n\n" + agent.ephemeral_system_prompt).strip()
    if system_text:
        api_messages.insert(0, {"role": "system", "content": system_text})
    if agent.prefill_messages:
        # Prefill goes right after the system message when there is one.
        offset = 1 if system_text else 0
        for idx, prefill in enumerate(agent.prefill_messages):
            api_messages.insert(offset + idx, prefill.copy())

    # Same safety nets as the main loop: repair tool-call/result pairing
    # (compression or session resume can orphan a tool result, which the
    # Responses API rejects), then drop thinking-only assistant turns so
    # Anthropic-family providers don't 400 the summary call.
    api_messages = agent._sanitize_api_messages(api_messages)
    return agent._drop_thinking_only_and_merge_users(api_messages)


def _summary_request_options(agent):
    """Resolve ``(temperature, lm_reasoning_effort, extra_body)`` for the summary request."""
    extra_body = {}
    try:
        from agent.auxiliary_client import (
            OMIT_TEMPERATURE as omit_marker,
            _fixed_temperature_for_model as fixed_temperature_for,
        )
    except Exception:
        # Best-effort: without the helper, send no explicit temperature.
        temperature = None
    else:
        temperature = fixed_temperature_for(agent.model, agent.base_url)
        if temperature is omit_marker:
            temperature = None

    is_nous = "nousresearch" in agent._base_url_lower

    # LM Studio takes a top-level ``reasoning_effort`` instead of
    # ``extra_body.reasoning``; the summary call bypasses the transport, so
    # it has to reproduce that request shape itself.
    is_lmstudio = (
        (agent.provider or "").strip().lower() == "lmstudio"
        and agent._supports_reasoning_extra_body()
    )
    lm_reasoning_effort = (
        agent._resolve_lmstudio_summary_reasoning_effort() if is_lmstudio else None
    )
    if not is_lmstudio and agent._supports_reasoning_extra_body():
        if agent.reasoning_config is not None:
            extra_body["reasoning"] = agent.reasoning_config
        else:
            extra_body["reasoning"] = {"enabled": True, "effort": "medium"}
    if is_nous:
        from agent.portal_tags import nous_portal_tags
        extra_body["tags"] = nous_portal_tags()
    return temperature, lm_reasoning_effort, extra_body


def _summary_add_openrouter_routing(agent, extra_body: dict) -> None:
    """Add OpenRouter provider preferences and the Pareto Code plugin to *extra_body*."""

    def targets_openrouter() -> bool:
        return (
            (agent.provider or "").strip().lower() == "openrouter"
            or agent._is_openrouter_url()
        )

    preferences = {}
    if agent.providers_allowed:
        preferences["only"] = agent.providers_allowed
    if agent.providers_ignored:
        preferences["ignore"] = agent.providers_ignored
    if agent.providers_order:
        preferences["order"] = agent.providers_order
    if agent.provider_sort:
        preferences["sort"] = agent.provider_sort
    if preferences and targets_openrouter():
        extra_body["provider"] = preferences

    # Pareto Code router plugin — model-gated, so summary calls on
    # openrouter/pareto-code respect the user's coding-score floor.
    if (
        agent.model == "openrouter/pareto-code"
        and targets_openrouter()
        and agent.openrouter_min_coding_score is not None
        and agent.openrouter_min_coding_score != ""
    ):
        try:
            min_score = float(agent.openrouter_min_coding_score)
        except (TypeError, ValueError):
            min_score = None
        if min_score is not None and 0.0 <= min_score <= 1.0:
            extra_body["plugins"] = [
                {"id": "pareto-router", "min_coding_score": min_score}
            ]


def _summary_request_text(agent, api_messages: list, *, temperature,
                          lm_reasoning_effort, extra_body: dict, reason: str) -> str:
    """Send one summary request via the agent's current api_mode; return the stripped text."""
    if agent.api_mode == "codex_responses":
        codex_kwargs = agent._build_api_kwargs(api_messages)
        codex_kwargs.pop("tools", None)  # the summary turn must not call tools
        response = agent._run_codex_stream(codex_kwargs)
        normalized = agent._get_transport().normalize_response(response)
        return (normalized.content or "").strip()

    if agent.api_mode == "anthropic_messages":
        transport = agent._get_transport()
        anthropic_kwargs = transport.build_kwargs(
            model=agent.model,
            messages=api_messages,
            tools=None,
            max_tokens=agent.max_tokens,
            reasoning_config=agent.reasoning_config,
            is_oauth=agent._is_anthropic_oauth,
            preserve_dots=agent._anthropic_preserve_dots(),
        )
        response = agent._anthropic_messages_create(anthropic_kwargs)
        normalized = transport.normalize_response(
            response, strip_tool_prefix=agent._is_anthropic_oauth
        )
        return (normalized.content or "").strip()

    # Chat-completions path: called directly, not through the transport.
    chat_kwargs = {"model": agent.model, "messages": api_messages}
    if temperature is not None:
        chat_kwargs["temperature"] = temperature
    if agent.max_tokens is not None:
        chat_kwargs.update(agent._max_tokens_param(agent.max_tokens))
    if lm_reasoning_effort is not None:
        chat_kwargs["reasoning_effort"] = lm_reasoning_effort
    _summary_add_openrouter_routing(agent, extra_body)
    if extra_body:
        chat_kwargs["extra_body"] = extra_body
    client = agent._ensure_primary_openai_client(reason=reason)
    response = client.chat.completions.create(**chat_kwargs)
    normalized = agent._get_transport().normalize_response(response)
    return (normalized.content or "").strip()


def _summary_record(messages: list, text: str) -> str:
    """Strip reasoning from *text*, append it to *messages*, or return the fallback notice."""
    visible = _summary_strip_reasoning(text)
    if not visible:
        return _SUMMARY_UNAVAILABLE_TEXT
    messages.append({"role": "assistant", "content": visible})
    return visible


def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
    """Request a final summary once the iteration limit is reached.

    Appends a user message asking for a tool-free summary to ``messages``,
    sends the conversation through the agent's current ``api_mode``
    (``codex_responses``, ``anthropic_messages`` or chat completions), and
    retries once when the first reply is empty. Inline think-tag reasoning
    blocks are removed from the reply; ordinary whitespace is preserved.

    Args:
        agent: Agent whose client, model and provider settings are used.
        messages: Conversation history. Mutated in place: the summary
            request is always appended, and the summary itself is appended
            as an assistant message when one is produced.
        api_call_count: Not used by this function; kept for interface
            compatibility.

    Returns:
        The summary text, a fixed notice when no visible summary was
        produced, or an error notice when the request raised.
    """
    print(f"⚠️ Reached maximum iterations ({agent.max_iterations}). Requesting summary...")
    messages.append({"role": "user", "content": _SUMMARY_REQUEST_TEXT})

    try:
        api_messages = _summary_build_messages(agent, messages)
        temperature, lm_reasoning_effort, extra_body = _summary_request_options(agent)

        def ask(reason: str) -> str:
            return _summary_request_text(
                agent,
                api_messages,
                temperature=temperature,
                lm_reasoning_effort=lm_reasoning_effort,
                extra_body=extra_body,
                reason=reason,
            )

        final_response = ask("iteration_limit_summary")
        if not final_response:
            # An empty reply gets exactly one retry. A reply that is
            # non-empty but reasoning-only is not retried.
            final_response = ask("iteration_limit_summary_retry")

        if final_response:
            final_response = _summary_record(messages, final_response)
        else:
            final_response = _SUMMARY_UNAVAILABLE_TEXT

    except Exception as e:
        # Top-level boundary: the agent loop must always get a string back.
        logging.warning(f"Failed to get summary response: {e}")
        final_response = f"I reached the maximum iterations ({agent.max_iterations}) but couldn't summarize. Error: {str(e)}"

    return final_response
def cleanup_task_resources(agent, task_id: str) -> None:
    """Release the VM and browser resources tied to ``task_id``.

    The VM is left alone when ``is_persistent_env(task_id)`` is true, so a
    long-lived sandbox survives between turns; per the original note, the
    idle reaper (``terminal_tool._cleanup_inactive_envs``) tears it down
    once ``terminal.lifetime_seconds`` is exceeded. Non-persistent
    environments are torn down here on every call to prevent resource
    leakage. Browser cleanup is attempted regardless of what happened to
    the VM.

    Both steps are best-effort: any exception is swallowed and is only
    logged when ``agent.verbose_logging`` is set.

    Args:
        agent: Agent whose ``verbose_logging`` flag controls diagnostics.
        task_id: Identifier of the task whose resources are released.
    """
    # VM teardown. cleanup_vm is resolved through _ra() at call time.
    try:
        keep_vm = is_persistent_env(task_id)
        if not keep_vm:
            _ra().cleanup_vm(task_id)
        elif agent.verbose_logging:
            logging.debug(
                f"Skipping per-turn cleanup_vm for persistent env {task_id}; "
                f"idle reaper will handle it."
            )
    except Exception as vm_error:
        if agent.verbose_logging:
            logging.warning(f"Failed to cleanup VM for task {task_id}: {vm_error}")

    # Browser teardown is independent of the VM outcome.
    try:
        _ra().cleanup_browser(task_id)
    except Exception as browser_error:
        if agent.verbose_logging:
            logging.warning(f"Failed to cleanup browser for task {task_id}: {browser_error}")
+ Tool-call turns suppress the callback — only text-only final responses + stream to the consumer. Returns a SimpleNamespace that mimics the + non-streaming response shape so the rest of the agent loop is unchanged. + + Falls back to _interruptible_api_call on provider errors indicating + streaming is not supported. + """ + if agent._interrupt_requested: + raise InterruptedError("Agent interrupted before streaming API call") + + if agent.api_mode == "codex_responses": + # Codex streams internally via _run_codex_stream. The main dispatch + # in _interruptible_api_call already calls it; we just need to + # ensure on_first_delta reaches it. Store it on the instance + # temporarily so _run_codex_stream can pick it up. + agent._codex_on_first_delta = on_first_delta + try: + return agent._interruptible_api_call(api_kwargs) + finally: + agent._codex_on_first_delta = None + + # Bedrock Converse uses boto3's converse_stream() with real-time delta + # callbacks — same UX as Anthropic and chat_completions streaming. + if agent.api_mode == "bedrock_converse": + result = {"response": None, "error": None} + first_delta_fired = {"done": False} + deltas_were_sent = {"yes": False} + + def _fire_first(): + if not first_delta_fired["done"] and on_first_delta: + first_delta_fired["done"] = True + try: + on_first_delta() + except Exception: + pass + + def _bedrock_call(): + try: + from agent.bedrock_adapter import ( + _get_bedrock_runtime_client, + invalidate_runtime_client, + is_stale_connection_error, + stream_converse_with_callbacks, + ) + region = api_kwargs.pop("__bedrock_region__", "us-east-1") + api_kwargs.pop("__bedrock_converse__", None) + client = _get_bedrock_runtime_client(region) + try: + raw_response = client.converse_stream(**api_kwargs) + except Exception as _bedrock_exc: + # Evict the cached client on stale-connection failures + # so the outer retry loop builds a fresh client/pool. 
+ if is_stale_connection_error(_bedrock_exc): + invalidate_runtime_client(region) + raise + + def _on_text(text): + _fire_first() + agent._fire_stream_delta(text) + deltas_were_sent["yes"] = True + + def _on_tool(name): + _fire_first() + agent._fire_tool_gen_started(name) + + def _on_reasoning(text): + _fire_first() + agent._fire_reasoning_delta(text) + + result["response"] = stream_converse_with_callbacks( + raw_response, + on_text_delta=_on_text if agent._has_stream_consumers() else None, + on_tool_start=_on_tool, + on_reasoning_delta=_on_reasoning if agent.reasoning_callback or agent.stream_delta_callback else None, + on_interrupt_check=lambda: agent._interrupt_requested, + ) + except Exception as e: + result["error"] = e + + t = threading.Thread(target=_bedrock_call, daemon=True) + t.start() + while t.is_alive(): + t.join(timeout=0.3) + if agent._interrupt_requested: + raise InterruptedError("Agent interrupted during Bedrock API call") + if result["error"] is not None: + raise result["error"] + return result["response"] + + result = {"response": None, "error": None, "partial_tool_names": []} + request_client_holder = {"client": None, "diag": None} + first_delta_fired = {"done": False} + deltas_were_sent = {"yes": False} # Track if any deltas were fired (for fallback) + # Wall-clock timestamp of the last real streaming chunk. The outer + # poll loop uses this to detect stale connections that keep receiving + # SSE keep-alive pings but no actual data. + last_chunk_time = {"t": time.time()} + + def _fire_first_delta(): + if not first_delta_fired["done"] and on_first_delta: + first_delta_fired["done"] = True + try: + on_first_delta() + except Exception: + pass + + def _call_chat_completions(): + """Stream a chat completions response.""" + import httpx as _httpx + # Per-provider / per-model request_timeout_seconds (from config.yaml) + # wins over the HERMES_API_TIMEOUT env default if the user set it. 
+ _provider_timeout_cfg = get_provider_request_timeout(agent.provider, agent.model) + _base_timeout = ( + _provider_timeout_cfg + if _provider_timeout_cfg is not None + else float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + ) + # Read timeout: config wins here too. Otherwise use + # HERMES_STREAM_READ_TIMEOUT (default 120s) for cloud providers. + if _provider_timeout_cfg is not None: + _stream_read_timeout = _provider_timeout_cfg + else: + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + # Local providers (Ollama, llama.cpp, vLLM) can take minutes for + # prefill on large contexts before producing the first token. + # Auto-increase the httpx read timeout unless the user explicitly + # overrode HERMES_STREAM_READ_TIMEOUT. + if _stream_read_timeout == 120.0 and agent.base_url and is_local_endpoint(agent.base_url): + _stream_read_timeout = _base_timeout + logger.debug( + "Local provider detected (%s) — stream read timeout raised to %.0fs", + agent.base_url, _stream_read_timeout, + ) + stream_kwargs = { + **api_kwargs, + "stream": True, + "stream_options": {"include_usage": True}, + "timeout": _httpx.Timeout( + connect=30.0, + read=_stream_read_timeout, + write=_base_timeout, + pool=30.0, + ), + } + request_client_holder["client"] = agent._create_request_openai_client( + reason="chat_completion_stream_request", + api_kwargs=stream_kwargs, + ) + # Reset stale-stream timer so the detector measures from this + # attempt's start, not a previous attempt's last chunk. + last_chunk_time["t"] = time.time() + agent._touch_activity("waiting for provider response (streaming)") + # Initialize per-attempt stream diagnostics so the retry block can + # reach for them after the stream dies. Lives on + # ``request_client_holder["diag"]`` for closure access. 
+ _diag = agent._stream_diag_init() + request_client_holder["diag"] = _diag + stream = request_client_holder["client"].chat.completions.create(**stream_kwargs) + + # Capture rate limit headers from the initial HTTP response. + # The OpenAI SDK Stream object exposes the underlying httpx + # response via .response before any chunks are consumed. + agent._capture_rate_limits(getattr(stream, "response", None)) + # Snapshot diagnostic headers (cf-ray, x-openrouter-provider, etc.) + # so they survive even when the stream dies before any chunk + # arrives. Best-effort; never raises. + agent._stream_diag_capture_response(_diag, getattr(stream, "response", None)) + + # Log OpenRouter response cache status when present. + agent._check_openrouter_cache_status(getattr(stream, "response", None)) + + content_parts: list = [] + tool_calls_acc: dict = {} + tool_gen_notified: set = set() + # Ollama-compatible endpoints reuse index 0 for every tool call + # in a parallel batch, distinguishing them only by id. Track + # the last seen id per raw index so we can detect a new tool + # call starting at the same index and redirect it to a fresh slot. + _last_id_at_idx: dict = {} # raw_index -> last seen non-empty id + _active_slot_by_idx: dict = {} # raw_index -> current slot in tool_calls_acc + finish_reason = None + model_name = None + role = "assistant" + reasoning_parts: list = [] + usage_obj = None + for chunk in stream: + last_chunk_time["t"] = time.time() + agent._touch_activity("receiving stream response") + + # Update per-attempt diagnostic counters. Best-effort — + # failures are swallowed so the streaming hot path is never + # interrupted by diagnostic accounting. 
+ try: + _diag["chunks"] = int(_diag.get("chunks", 0)) + 1 + if _diag.get("first_chunk_at") is None: + _diag["first_chunk_at"] = last_chunk_time["t"] + # Approximate byte size from the chunk's repr — exact wire + # bytes aren't exposed by the SDK, but len(repr(chunk)) is + # a stable proxy for "how much content arrived" that + # survives stub provider differences. + try: + _diag["bytes"] = int(_diag.get("bytes", 0)) + len(repr(chunk)) + except Exception: + pass + except Exception: + pass + + if agent._interrupt_requested: + break + + if not chunk.choices: + if hasattr(chunk, "model") and chunk.model: + model_name = chunk.model + # Usage comes in the final chunk with empty choices + if hasattr(chunk, "usage") and chunk.usage: + usage_obj = chunk.usage + continue + + delta = chunk.choices[0].delta + if hasattr(chunk, "model") and chunk.model: + model_name = chunk.model + + # Accumulate reasoning content + reasoning_text = getattr(delta, "reasoning_content", None) or getattr(delta, "reasoning", None) + if reasoning_text: + reasoning_parts.append(reasoning_text) + _fire_first_delta() + agent._fire_reasoning_delta(reasoning_text) + + # Accumulate text content — fire callback only when no tool calls + if delta and delta.content: + content_parts.append(delta.content) + if not tool_calls_acc: + _fire_first_delta() + agent._fire_stream_delta(delta.content) + deltas_were_sent["yes"] = True + # Tool calls suppress regular content streaming (avoids + # displaying chatty "I'll use the tool..." text alongside + # tool calls). But reasoning tags embedded in suppressed + # content should still reach the display — otherwise the + # reasoning box only appears as a post-response fallback, + # rendering it confusingly after the already-streamed + # response. Route suppressed content through the stream + # delta callback so its tag extraction can fire the + # reasoning display. 
Non-reasoning text is harmlessly + # suppressed by the CLI's _stream_delta when the stream + # box is already closed (tool boundary flush). + elif agent.stream_delta_callback: + try: + agent.stream_delta_callback(delta.content) + agent._record_streamed_assistant_text(delta.content) + except Exception: + pass + + # Accumulate tool call deltas — notify display on first name + if delta and delta.tool_calls: + for tc_delta in delta.tool_calls: + raw_idx = tc_delta.index if tc_delta.index is not None else 0 + delta_id = tc_delta.id or "" + + # Ollama fix: detect a new tool call reusing the same + # raw index (different id) and redirect to a fresh slot. + if raw_idx not in _active_slot_by_idx: + _active_slot_by_idx[raw_idx] = raw_idx + if ( + delta_id + and raw_idx in _last_id_at_idx + and delta_id != _last_id_at_idx[raw_idx] + ): + new_slot = max(tool_calls_acc, default=-1) + 1 + _active_slot_by_idx[raw_idx] = new_slot + if delta_id: + _last_id_at_idx[raw_idx] = delta_id + idx = _active_slot_by_idx[raw_idx] + + if idx not in tool_calls_acc: + tool_calls_acc[idx] = { + "id": tc_delta.id or "", + "type": "function", + "function": {"name": "", "arguments": ""}, + "extra_content": None, + } + entry = tool_calls_acc[idx] + if tc_delta.id: + entry["id"] = tc_delta.id + if tc_delta.function: + if tc_delta.function.name: + # Use assignment, not +=. Function names are + # atomic identifiers delivered complete in the + # first chunk (OpenAI spec). Some providers + # (MiniMax M2.7 via NVIDIA NIM) resend the full + # name in every chunk; concatenation would + # produce "read_fileread_file". Assignment + # (matching the OpenAI Node SDK / LiteLLM / + # Vercel AI patterns) is immune to this. 
+ entry["function"]["name"] = tc_delta.function.name + if tc_delta.function.arguments: + entry["function"]["arguments"] += tc_delta.function.arguments + extra = getattr(tc_delta, "extra_content", None) + if extra is None and hasattr(tc_delta, "model_extra"): + extra = (tc_delta.model_extra or {}).get("extra_content") + if extra is not None: + if hasattr(extra, "model_dump"): + extra = extra.model_dump() + entry["extra_content"] = extra + # Fire once per tool when the full name is available + name = entry["function"]["name"] + if name and idx not in tool_gen_notified: + tool_gen_notified.add(idx) + _fire_first_delta() + agent._fire_tool_gen_started(name) + # Record the partial tool-call name so the outer + # stub-builder can surface a user-visible warning + # if streaming dies before this tool's arguments + # are fully delivered. Without this, a stall + # during tool-call JSON generation lets the stub + # at line ~6107 return `tool_calls=None`, silently + # discarding the attempted action. + result["partial_tool_names"].append(name) + + if chunk.choices[0].finish_reason: + finish_reason = chunk.choices[0].finish_reason + + # Usage in the final chunk + if hasattr(chunk, "usage") and chunk.usage: + usage_obj = chunk.usage + + # Build mock response matching non-streaming shape + full_content = "".join(content_parts) or None + mock_tool_calls = None + has_truncated_tool_args = False + if tool_calls_acc: + mock_tool_calls = [] + for idx in sorted(tool_calls_acc): + tc = tool_calls_acc[idx] + arguments = tc["function"]["arguments"] + tool_name = tc["function"]["name"] or "?" + if arguments and arguments.strip(): + try: + json.loads(arguments) + except json.JSONDecodeError: + # Attempt repair before flagging as truncated. + # Models like GLM-5.1 via Ollama produce trailing + # commas, unclosed brackets, Python None, etc. + # Without repair, these hit the truncation handler + # and kill the session. 
_repair_tool_call_arguments + # returns "{}" for unrepairable args, which is far + # better than a crashed session. + repaired = _repair_tool_call_arguments(arguments, tool_name) + if repaired != "{}": + # Successfully repaired — use the fixed args + arguments = repaired + else: + # Unrepairable — flag for truncation handling + has_truncated_tool_args = True + mock_tool_calls.append(SimpleNamespace( + id=tc["id"], + type=tc["type"], + extra_content=tc.get("extra_content"), + function=SimpleNamespace( + name=tc["function"]["name"], + arguments=arguments, + ), + )) + + effective_finish_reason = finish_reason or "stop" + if has_truncated_tool_args: + effective_finish_reason = "length" + + full_reasoning = "".join(reasoning_parts) or None + mock_message = SimpleNamespace( + role=role, + content=full_content, + tool_calls=mock_tool_calls, + reasoning_content=full_reasoning, + ) + mock_choice = SimpleNamespace( + index=0, + message=mock_message, + finish_reason=effective_finish_reason, + ) + return SimpleNamespace( + id="stream-" + str(uuid.uuid4()), + model=model_name, + choices=[mock_choice], + usage=usage_obj, + ) + + def _call_anthropic(): + """Stream an Anthropic Messages API response. + + Fires delta callbacks for real-time token delivery, but returns + the native Anthropic Message object from get_final_message() so + the rest of the agent loop (validation, tool extraction, etc.) + works unchanged. + """ + has_tool_use = False + + # Reset stale-stream timer for this attempt + last_chunk_time["t"] = time.time() + # Per-attempt diagnostic dict for the retry block to consume. + _diag = agent._stream_diag_init() + request_client_holder["diag"] = _diag + # Use the Anthropic SDK's streaming context manager + with agent._anthropic_client.messages.stream(**api_kwargs) as stream: + # The Anthropic SDK exposes the raw httpx response on + # ``stream.response``. Snapshot diagnostic headers + # immediately so they survive a stream that dies before the + # first event. 
+ try: + agent._stream_diag_capture_response( + _diag, getattr(stream, "response", None) + ) + except Exception: + pass + for event in stream: + # Update stale-stream timer on every event so the + # outer poll loop knows data is flowing. Without + # this, the detector kills healthy long-running + # Opus streams after 180 s even when events are + # actively arriving (the chat_completions path + # already does this at the top of its chunk loop). + last_chunk_time["t"] = time.time() + agent._touch_activity("receiving stream response") + + # Update per-attempt diagnostic counters (best-effort). + try: + _diag["chunks"] = int(_diag.get("chunks", 0)) + 1 + if _diag.get("first_chunk_at") is None: + _diag["first_chunk_at"] = last_chunk_time["t"] + try: + _diag["bytes"] = int(_diag.get("bytes", 0)) + len(repr(event)) + except Exception: + pass + except Exception: + pass + + if agent._interrupt_requested: + break + + event_type = getattr(event, "type", None) + + if event_type == "content_block_start": + block = getattr(event, "content_block", None) + if block and getattr(block, "type", None) == "tool_use": + has_tool_use = True + tool_name = getattr(block, "name", None) + if tool_name: + _fire_first_delta() + agent._fire_tool_gen_started(tool_name) + + elif event_type == "content_block_delta": + delta = getattr(event, "delta", None) + if delta: + delta_type = getattr(delta, "type", None) + if delta_type == "text_delta": + text = getattr(delta, "text", "") + if text and not has_tool_use: + _fire_first_delta() + agent._fire_stream_delta(text) + deltas_were_sent["yes"] = True + elif delta_type == "thinking_delta": + thinking_text = getattr(delta, "thinking", "") + if thinking_text: + _fire_first_delta() + agent._fire_reasoning_delta(thinking_text) + + # Return the native Anthropic Message for downstream processing + return stream.get_final_message() + + def _call(): + import httpx as _httpx + + _max_stream_retries = int(os.getenv("HERMES_STREAM_RETRIES", 2)) + + try: + for 
_stream_attempt in range(_max_stream_retries + 1): + # Check for interrupt before each retry attempt. Without + # this, /stop closes the HTTP connection (outer poll loop), + # but the retry loop opens a FRESH connection — negating the + # interrupt entirely. On slow providers (ollama-cloud) each + # retry can block for the full stream-read timeout (120s+), + # causing multi-minute delays between /stop and response. + if agent._interrupt_requested: + raise InterruptedError("Agent interrupted before stream retry") + try: + if agent.api_mode == "anthropic_messages": + agent._try_refresh_anthropic_client_credentials() + result["response"] = _call_anthropic() + else: + result["response"] = _call_chat_completions() + return # success + except Exception as e: + _is_timeout = isinstance( + e, (_httpx.ReadTimeout, _httpx.ConnectTimeout, _httpx.PoolTimeout) + ) + _is_conn_err = isinstance( + e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError) + ) + _is_stream_parse_err = agent._is_provider_stream_parse_error(e) + + # If the stream died AFTER some tokens were delivered: + # normally we don't retry (the user already saw text, + # retrying would duplicate it). BUT: if a tool call + # was in-flight when the stream died, silently aborting + # discards the tool call entirely. In that case we + # prefer to retry — the user sees a brief + # "reconnecting" marker + duplicated preamble text, + # which is strictly better than a failed action with + # a "retry manually" message. Limit this to transient + # connection errors (Clawdbot-style narrow gate): no + # tool has executed yet within this API call, so + # silent retry is safe wrt side-effects. 
+ if deltas_were_sent["yes"]: + _partial_tool_in_flight = bool( + result.get("partial_tool_names") + ) + _is_sse_conn_err_preview = False + if not _is_timeout and not _is_conn_err: + from openai import APIError as _APIError + if isinstance(e, _APIError) and not getattr(e, "status_code", None): + _err_lower_preview = str(e).lower() + _SSE_PREVIEW_PHRASES = ( + "connection lost", + "connection reset", + "connection closed", + "connection terminated", + "network error", + "network connection", + "terminated", + "peer closed", + "broken pipe", + "upstream connect error", + ) + _is_sse_conn_err_preview = any( + phrase in _err_lower_preview + for phrase in _SSE_PREVIEW_PHRASES + ) + _is_transient = ( + _is_timeout + or _is_conn_err + or _is_sse_conn_err_preview + or _is_stream_parse_err + ) + _can_silent_retry = ( + _partial_tool_in_flight + and _is_transient + and _stream_attempt < _max_stream_retries + ) + if not _can_silent_retry: + # Either no tool call was in-flight (so the + # turn was a pure text response — current + # stub-with-recovered-text behaviour is + # correct), or retries are exhausted, or the + # error isn't transient. Fall through to the + # stub path. + logger.warning( + "Streaming failed after partial delivery, not retrying: %s", e + ) + result["error"] = e + return + # Tool call was in-flight AND error is transient: + # retry silently. Clear per-attempt state so the + # next stream starts clean. Fire a "reconnecting" + # marker so the user sees why the preamble is + # about to be re-streamed. Structured WARNING is + # emitted by ``_emit_stream_drop`` below; no + # additional INFO line needed. + try: + agent._fire_stream_delta( + "\n\n⚠ Connection dropped mid tool-call; " + "reconnecting…\n\n" + ) + except Exception: + pass + # Reset the streamed-text buffer so the retry's + # fresh preamble doesn't get double-recorded in + # _current_streamed_assistant_text (which would + # pollute the interim-visible-text comparison). 
+ try: + agent._reset_stream_delivery_tracking() + except Exception: + pass + # Reset in-memory accumulators so the next + # attempt's chunks don't concat onto the dead + # stream's partial JSON. + result["partial_tool_names"] = [] + deltas_were_sent["yes"] = False + first_delta_fired["done"] = False + agent._emit_stream_drop( + error=e, + attempt=_stream_attempt + 2, + max_attempts=_max_stream_retries + 1, + mid_tool_call=True, + diag=request_client_holder.get("diag"), + ) + stale = request_client_holder.get("client") + if stale is not None: + agent._close_request_openai_client( + stale, reason="stream_mid_tool_retry_cleanup" + ) + request_client_holder["client"] = None + try: + agent._replace_primary_openai_client( + reason="stream_mid_tool_retry_pool_cleanup" + ) + except Exception: + pass + continue + + # SSE error events from proxies (e.g. OpenRouter sends + # {"error":{"message":"Network connection lost."}}) are + # raised as APIError by the OpenAI SDK. These are + # semantically identical to httpx connection drops — + # the upstream stream died — and should be retried with + # a fresh connection. Distinguish from HTTP errors: + # APIError from SSE has no status_code, while + # APIStatusError (4xx/5xx) always has one. + _is_sse_conn_err = False + if not _is_timeout and not _is_conn_err: + from openai import APIError as _APIError + if isinstance(e, _APIError) and not getattr(e, "status_code", None): + _err_lower_sse = str(e).lower() + _SSE_CONN_PHRASES = ( + "connection lost", + "connection reset", + "connection closed", + "connection terminated", + "network error", + "network connection", + "terminated", + "peer closed", + "broken pipe", + "upstream connect error", + ) + _is_sse_conn_err = any( + phrase in _err_lower_sse + for phrase in _SSE_CONN_PHRASES + ) + + if _is_timeout or _is_conn_err or _is_sse_conn_err or _is_stream_parse_err: + # Transient network / timeout error. Retry the + # streaming request with a fresh connection first. 
+ if _stream_attempt < _max_stream_retries: + agent._emit_stream_drop( + error=e, + attempt=_stream_attempt + 2, + max_attempts=_max_stream_retries + 1, + mid_tool_call=False, + diag=request_client_holder.get("diag"), + ) + # Close the stale request client before retry + stale = request_client_holder.get("client") + if stale is not None: + agent._close_request_openai_client( + stale, reason="stream_retry_cleanup" + ) + request_client_holder["client"] = None + # Also rebuild the primary client to purge + # any dead connections from the pool. + try: + agent._replace_primary_openai_client( + reason="stream_retry_pool_cleanup" + ) + except Exception: + pass + continue + # Retries exhausted. Log the final failure with + # full diagnostic detail (chain, headers, + # bytes/elapsed) via the same helper used for + # mid-flight retries — subagent lines get the + # ``[subagent-N]`` log_prefix so the parent can + # attribute them. + agent._log_stream_retry( + kind="exhausted", + error=e, + attempt=_max_stream_retries + 1, + max_attempts=_max_stream_retries + 1, + mid_tool_call=False, + diag=request_client_holder.get("diag"), + ) + agent._emit_status( + "❌ Provider returned malformed streaming data after " + f"{_max_stream_retries + 1} attempts. " + "The provider may be experiencing issues — " + "try again in a moment." + if _is_stream_parse_err else + "❌ Connection to provider failed after " + f"{_max_stream_retries + 1} attempts. " + "The provider may be experiencing issues — " + "try again in a moment." + ) + else: + _err_lower = str(e).lower() + _is_stream_unsupported = ( + "stream" in _err_lower + and "not supported" in _err_lower + ) + if _is_stream_unsupported: + agent._disable_streaming = True + agent._safe_print( + "\n⚠ Streaming is not supported for this " + "model/provider. 
Switching to non-streaming.\n" + " To avoid this delay, set display.streaming: false " + "in config.yaml\n" + ) + logger.info( + "Streaming failed before delivery: %s", + e, + ) + + # Propagate the error to the main retry loop instead of + # falling back to non-streaming inline. The main loop has + # richer recovery: credential rotation, provider fallback, + # backoff, and — for "stream not supported" — will switch + # to non-streaming on the next attempt via _disable_streaming. + result["error"] = e + return + except InterruptedError as e: + # The interrupt may be noticed inside the worker thread before + # the polling loop sees it. Surface it through the normal result + # channel so callers never miss a fast pre-retry interrupt. + result["error"] = e + return + finally: + request_client = request_client_holder.get("client") + if request_client is not None: + agent._close_request_openai_client(request_client, reason="stream_request_complete") + + _stream_stale_timeout_base = float(os.getenv("HERMES_STREAM_STALE_TIMEOUT", 180.0)) + # Local providers (Ollama, oMLX, llama-cpp) can take 300+ seconds + # for prefill on large contexts. Disable the stale detector unless + # the user explicitly set HERMES_STREAM_STALE_TIMEOUT. + if _stream_stale_timeout_base == 180.0 and agent.base_url and is_local_endpoint(agent.base_url): + _stream_stale_timeout = float("inf") + logger.debug("Local provider detected (%s) — stale stream timeout disabled", agent.base_url) + else: + # Scale the stale timeout for large contexts: slow models (like Opus) + # can legitimately think for minutes before producing the first token + # when the context is large. Without this, the stale detector kills + # healthy connections during the model's thinking phase, producing + # spurious RemoteProtocolError ("peer closed connection"). 
+ _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 + if _est_tokens > 100_000: + _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0) + elif _est_tokens > 50_000: + _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0) + else: + _stream_stale_timeout = _stream_stale_timeout_base + + t = threading.Thread(target=_call, daemon=True) + t.start() + _last_heartbeat = time.time() + _HEARTBEAT_INTERVAL = 30.0 # seconds between gateway activity touches + while t.is_alive(): + t.join(timeout=0.3) + + # Periodic heartbeat: touch the agent's activity tracker so the + # gateway's inactivity monitor knows we're alive while waiting + # for stream chunks. Without this, long thinking pauses (e.g. + # reasoning models) or slow prefill on local providers (Ollama) + # trigger false inactivity timeouts. The _call thread touches + # activity on each chunk, but the gap between API call start + # and first chunk can exceed the gateway timeout — especially + # when the stale-stream timeout is disabled (local providers). + _hb_now = time.time() + if _hb_now - _last_heartbeat >= _HEARTBEAT_INTERVAL: + _last_heartbeat = _hb_now + _waiting_secs = int(_hb_now - last_chunk_time["t"]) + agent._touch_activity( + f"waiting for stream response ({_waiting_secs}s, no chunks yet)" + ) + + # Detect stale streams: connections kept alive by SSE pings + # but delivering no real chunks. Kill the client so the + # inner retry loop can start a fresh connection. + _stale_elapsed = time.time() - last_chunk_time["t"] + if _stale_elapsed > _stream_stale_timeout: + _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 + logger.warning( + "Stream stale for %.0fs (threshold %.0fs) — no chunks received. " + "model=%s context=~%s tokens. 
Killing connection.", + _stale_elapsed, _stream_stale_timeout, + api_kwargs.get("model", "unknown"), f"{_est_ctx:,}", + ) + agent._emit_status( + f"⚠️ No response from provider for {int(_stale_elapsed)}s " + f"(model: {api_kwargs.get('model', 'unknown')}, " + f"context: ~{_est_ctx:,} tokens). " + f"Reconnecting..." + ) + try: + rc = request_client_holder.get("client") + if rc is not None: + agent._close_request_openai_client(rc, reason="stale_stream_kill") + except Exception: + pass + # Rebuild the primary client too — its connection pool + # may hold dead sockets from the same provider outage. + try: + agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup") + except Exception: + pass + # Reset the timer so we don't kill repeatedly while + # the inner thread processes the closure. + last_chunk_time["t"] = time.time() + agent._touch_activity( + f"stale stream detected after {int(_stale_elapsed)}s, reconnecting" + ) + + if agent._interrupt_requested: + try: + if agent.api_mode == "anthropic_messages": + agent._anthropic_client.close() + agent._rebuild_anthropic_client() + else: + request_client = request_client_holder.get("client") + if request_client is not None: + agent._close_request_openai_client(request_client, reason="stream_interrupt_abort") + except Exception: + pass + raise InterruptedError("Agent interrupted during streaming API call") + if result["error"] is not None: + if deltas_were_sent["yes"]: + # Streaming failed AFTER some tokens were already delivered to + # the platform. Re-raising would let the outer retry loop make + # a new API call, creating a duplicate message. Return a + # partial "stop" response instead so the outer loop treats this + # turn as complete (no retry, no fallback). + # Recover whatever content was already streamed to the user. + # _current_streamed_assistant_text accumulates text fired + # through _fire_stream_delta, so it has exactly what the + # user saw before the connection died. 
+ _partial_text = ( + getattr(agent, "_current_streamed_assistant_text", "") or "" + ).strip() or None + + # If the stream died while the model was emitting a tool call, + # the stub below will silently set `tool_calls=None` and the + # agent loop will treat the turn as complete — the attempted + # action is lost with no user-facing signal. Append a + # human-visible warning to the stub content so (a) the user + # knows something failed, and (b) the next turn's model sees + # in conversation history what was attempted and can retry. + _partial_names = list(result.get("partial_tool_names") or []) + if _partial_names: + _name_str = ", ".join(_partial_names[:3]) + if len(_partial_names) > 3: + _name_str += f", +{len(_partial_names) - 3} more" + _warn = ( + f"\n\n⚠ Stream stalled mid tool-call " + f"({_name_str}); the action was not executed. " + f"Ask me to retry if you want to continue." + ) + _partial_text = (_partial_text or "") + _warn + # Also fire as a streaming delta so the user sees it now + # instead of only in the persisted transcript. 
+ try: + agent._fire_stream_delta(_warn) + except Exception: + pass + logger.warning( + "Partial stream dropped tool call(s) %s after %s chars " + "of text; surfaced warning to user: %s", + _partial_names, len(_partial_text or ""), result["error"], + ) + else: + logger.warning( + "Partial stream delivered before error; returning stub " + "response with %s chars of recovered content to prevent " + "duplicate messages: %s", + len(_partial_text or ""), + result["error"], + ) + _stub_msg = SimpleNamespace( + role="assistant", content=_partial_text, tool_calls=None, + reasoning_content=None, + ) + return SimpleNamespace( + id="partial-stream-stub", + model=getattr(agent, "model", "unknown"), + choices=[SimpleNamespace( + index=0, message=_stub_msg, finish_reason="stop", + )], + usage=None, + ) + raise result["error"] + return result["response"] + +# ── Provider fallback ────────────────────────────────────────────────── + + + +__all__ = [ + "interruptible_api_call", + "build_api_kwargs", + "build_assistant_message", + "try_activate_fallback", + "handle_max_iterations", + "cleanup_task_resources", + "interruptible_streaming_api_call", +] diff --git a/agent/codex_runtime.py b/agent/codex_runtime.py new file mode 100644 index 00000000000..02b788f5777 --- /dev/null +++ b/agent/codex_runtime.py @@ -0,0 +1,448 @@ +"""Codex API runtime — App Server and Responses-API streaming paths. + +Extracted from :class:`AIAgent` to keep the agent loop file focused. +Each function takes the parent ``AIAgent`` as its first argument +(``agent``). AIAgent keeps thin forwarder methods for backward +compatibility. + +* ``run_codex_app_server_turn`` — drives one turn through the + ``codex_app_server`` subprocess client (used when a Codex CLI install + is the active provider). +* ``run_codex_stream`` — streams a Codex Responses API call (the + ``codex_responses`` api_mode). +* ``run_codex_create_stream_fallback`` — recovery path when the + Responses ``stream=True`` initial create fails. 
def _retire_codex_session(agent) -> None:
    """Close the agent's codex app-server session and forget it.

    Best-effort: an error raised while closing (including the session
    already being ``None``) is swallowed so it can never mask the turn
    result. The reference is always cleared so the next turn respawns.
    """
    try:
        agent._codex_session.close()
    except Exception:
        pass
    agent._codex_session = None


def run_codex_app_server_turn(
    agent,
    *,
    user_message: str,
    original_user_message: Any,
    messages: List[Dict[str, Any]],
    effective_task_id: str,
    should_review_memory: bool = False,
) -> Dict[str, Any]:
    """Codex app-server runtime path. Hands the entire turn to a `codex
    app-server` subprocess and projects its events back into Hermes'
    messages list so memory/skill review keep working.

    Called from run_conversation() when agent.api_mode == "codex_app_server".
    Returns the same dict shape as the chat_completions path.

    Args:
        agent: The parent ``AIAgent``; its ``_codex_session`` is created
            lazily here and reused across turns.
        user_message: Text passed to the session's ``run_turn``.
        original_user_message: Forwarded to the external memory sync.
        messages: Conversation list; extended IN PLACE with the turn's
            projected messages and returned in the result.
        effective_task_id: Accepted for signature parity with the caller;
            not read by this function.
        should_review_memory: Whether the caller's memory-review trigger
            tripped for this turn.

    Returns:
        A dict with ``final_response``, ``messages``, ``api_calls``,
        ``completed``, ``partial`` and ``error``; successful turns also
        carry ``codex_thread_id`` and ``codex_turn_id``. A crashed turn is
        reported through the dict (``completed=False``), never raised.
    """
    # Lazy session: one CodexAppServerSession per AIAgent instance.
    # Spawned on first turn, reused across turns, closed at AIAgent
    # shutdown (see _cleanup hook).
    if getattr(agent, "_codex_session", None) is None:
        # Imported only when a session actually has to be spawned, so
        # turns that reuse a live session skip the import entirely.
        from agent.transports.codex_app_server_session import CodexAppServerSession

        cwd = getattr(agent, "session_cwd", None) or os.getcwd()
        # Approval callback: defer to Hermes' standard prompt flow if a
        # CLI thread has installed one. Gateway / cron contexts get the
        # codex-side fail-closed default.
        try:
            from tools.terminal_tool import _get_approval_callback
            approval_callback = _get_approval_callback()
        except Exception:
            approval_callback = None
        agent._codex_session = CodexAppServerSession(
            cwd=cwd,
            approval_callback=approval_callback,
        )

    # NOTE: the user message is ALREADY appended to messages by the
    # standard run_conversation() flow (line ~11823) before the early
    # return reaches us. Do NOT append again — that would duplicate.

    try:
        turn = agent._codex_session.run_turn(user_input=user_message)
    except Exception as exc:
        logger.exception("codex app-server turn failed")
        # Crash → unconditionally drop the session so the next turn
        # respawns from scratch instead of reusing a dead client.
        _retire_codex_session(agent)
        return {
            "final_response": (
                f"Codex app-server turn failed: {exc}. "
                f"Fall back to default runtime with `/codex-runtime auto`."
            ),
            "messages": messages,
            "api_calls": 0,
            "completed": False,
            "partial": True,
            "error": str(exc),
        }

    # If the turn signalled the underlying client is wedged (deadline
    # blown, post-tool watchdog tripped, OAuth refresh died, subprocess
    # exited), retire the session so the next turn respawns codex
    # rather than riding the broken process. Mirrors openclaw beta.8's
    # "retire timed-out app-server clients" fix.
    if getattr(turn, "should_retire", False):
        logger.warning(
            "codex app-server session retired (turn error: %s)",
            turn.error,
        )
        _retire_codex_session(agent)

    # Splice projected messages into the conversation. The projector emits
    # standard {role, content, tool_calls, tool_call_id} entries, which
    # is exactly what curator.py / sessions DB expect.
    if turn.projected_messages:
        messages.extend(turn.projected_messages)

    # Counter ticks for the agent-improvement loop.
    # _turns_since_memory and _user_turn_count are ALREADY incremented
    # in the run_conversation() pre-loop block (lines ~11793-11817) so we
    # do NOT touch them here — that would double-count.
    # Only _iters_since_skill needs explicit increment, since the
    # chat_completions loop bumps it per tool iteration (line ~12110)
    # and that loop is bypassed on this path.
    agent._iters_since_skill = (
        getattr(agent, "_iters_since_skill", 0) + turn.tool_iterations
    )

    # Now check the skill nudge AFTER iters were incremented — same
    # pattern the chat_completions path uses (line ~15432).
    should_review_skills = False
    if (
        agent._skill_nudge_interval > 0
        and agent._iters_since_skill >= agent._skill_nudge_interval
        and "skill_manage" in agent.valid_tool_names
    ):
        should_review_skills = True
        agent._iters_since_skill = 0

    # External memory provider sync (mirrors line ~15439). Skipped on
    # interrupt/error to avoid feeding partial transcripts to memory.
    if not turn.interrupted and turn.error is None:
        try:
            agent._sync_external_memory_for_turn(
                original_user_message=original_user_message,
                final_response=turn.final_text,
                interrupted=False,
            )
        except Exception:
            logger.debug("external memory sync raised", exc_info=True)

    # Background review fork — same cadence + signature as the default
    # path (line ~15449). Only fires when a trigger actually tripped AND
    # we have a real final response.
    if (
        turn.final_text
        and not turn.interrupted
        and (should_review_memory or should_review_skills)
    ):
        try:
            agent._spawn_background_review(
                messages_snapshot=list(messages),
                review_memory=should_review_memory,
                review_skills=should_review_skills,
            )
        except Exception:
            logger.debug("background review spawn raised", exc_info=True)

    return {
        "final_response": turn.final_text,
        "messages": messages,
        "api_calls": 1,  # one app-server "turn" maps to one logical API call
        "completed": not turn.interrupted and turn.error is None,
        "partial": turn.interrupted or turn.error is not None,
        "error": turn.error,
        "codex_thread_id": turn.thread_id,
        "codex_turn_id": turn.turn_id,
    }
def run_codex_stream(agent, api_kwargs: dict, client: Any = None, on_first_delta: callable = None):
    """Execute one streaming Responses API request and return the final response.

    Streams ``client.responses.stream(**api_kwargs)``, forwarding text and
    reasoning deltas to the agent's callbacks, and returns the object from
    ``stream.get_final_response()``. If that response has an empty
    ``output`` list it is backfilled from the ``response.output_item.done``
    items seen on the wire, or synthesized from the streamed text deltas.

    Args:
        agent: The parent ``AIAgent``.
        api_kwargs: Keyword arguments for ``responses.stream``.
        client: Client to use; defaults to the agent's primary OpenAI client.
        on_first_delta: Optional zero-argument callable invoked once, just
            before the first visible text delta is fired.

    Returns:
        The final response object, or the result of the agent's
        ``create(stream=True)`` fallback after transport/prelude/postlude
        failures exhaust the single retry.

    Raises:
        InterruptedError: If an interrupt is pending before an attempt.
        RuntimeError: Re-raised when it is not one of the recognised
            ``response.created`` / ``response.completed`` stream errors.
    """
    import httpx as _httpx

    active_client = client or agent._ensure_primary_openai_client(reason="codex_stream_direct")
    max_stream_retries = 1
    # Deliberately NOT reset per attempt: on_first_delta fires at most once
    # for the whole call, even if the stream is retried.
    first_delta_fired = False
    for attempt in range(max_stream_retries + 1):
        # Per-attempt state. A retry starts a brand-new response, so text
        # deltas and the tool-call flag left behind by a dead attempt must
        # not leak into it — otherwise the synthesized output below would
        # contain the dead attempt's text twice, and a stale tool-call flag
        # would suppress the retry's text streaming.
        #
        # Accumulate streamed text so we can recover if get_final_response()
        # returns empty output (e.g. chatgpt.com backend-api sends
        # response.incomplete instead of response.completed).
        agent._codex_streamed_text_parts = []
        has_tool_calls = False
        collected_output_items: list = []
        if agent._interrupt_requested:
            raise InterruptedError("Agent interrupted before Codex stream retry")
        try:
            with active_client.responses.stream(**api_kwargs) as stream:
                for event in stream:
                    agent._touch_activity("receiving stream response")
                    if agent._interrupt_requested:
                        break
                    event_type = getattr(event, "type", "") or ""
                    # Fire callbacks on text content deltas (suppress during tool calls)
                    if "output_text.delta" in event_type:
                        delta_text = getattr(event, "delta", "")
                        if delta_text:
                            agent._codex_streamed_text_parts.append(delta_text)
                        if delta_text and not has_tool_calls:
                            if not first_delta_fired:
                                first_delta_fired = True
                                if on_first_delta:
                                    try:
                                        on_first_delta()
                                    except Exception:
                                        pass
                            agent._fire_stream_delta(delta_text)
                    # Track tool calls to suppress text streaming
                    elif "function_call" in event_type:
                        has_tool_calls = True
                    # Fire reasoning callbacks
                    elif "reasoning" in event_type and "delta" in event_type:
                        reasoning_text = getattr(event, "delta", "")
                        if reasoning_text:
                            agent._fire_reasoning_delta(reasoning_text)
                    # Collect completed output items — some backends
                    # (chatgpt.com/backend-api/codex) stream valid items
                    # via response.output_item.done but the SDK's
                    # get_final_response() returns an empty output list.
                    elif event_type == "response.output_item.done":
                        done_item = getattr(event, "item", None)
                        if done_item is not None:
                            collected_output_items.append(done_item)
                    # Log non-completed terminal events for diagnostics
                    elif event_type in {"response.incomplete", "response.failed"}:
                        resp_obj = getattr(event, "response", None)
                        status = getattr(resp_obj, "status", None) if resp_obj else None
                        incomplete_details = getattr(resp_obj, "incomplete_details", None) if resp_obj else None
                        logger.warning(
                            "Codex Responses stream received terminal event %s "
                            "(status=%s, incomplete_details=%s, streamed_chars=%d). %s",
                            event_type, status, incomplete_details,
                            sum(len(p) for p in agent._codex_streamed_text_parts),
                            agent._client_log_context(),
                        )
                final_response = stream.get_final_response()
                # PATCH: ChatGPT Codex backend streams valid output items
                # but get_final_response() can return an empty output list.
                # Backfill from collected items or synthesize from deltas.
                _out = getattr(final_response, "output", None)
                if isinstance(_out, list) and not _out:
                    if collected_output_items:
                        final_response.output = list(collected_output_items)
                        logger.debug(
                            "Codex stream: backfilled %d output items from stream events",
                            len(collected_output_items),
                        )
                    elif agent._codex_streamed_text_parts and not has_tool_calls:
                        assembled = "".join(agent._codex_streamed_text_parts)
                        final_response.output = [SimpleNamespace(
                            type="message",
                            role="assistant",
                            status="completed",
                            content=[SimpleNamespace(type="output_text", text=assembled)],
                        )]
                        logger.debug(
                            "Codex stream: synthesized output from %d text deltas (%d chars)",
                            len(agent._codex_streamed_text_parts), len(assembled),
                        )
                return final_response
        except (_httpx.RemoteProtocolError, _httpx.ReadTimeout, _httpx.ConnectError, ConnectionError) as exc:
            if attempt < max_stream_retries:
                logger.debug(
                    "Codex Responses stream transport failed (attempt %s/%s); retrying. %s error=%s",
                    attempt + 1,
                    max_stream_retries + 1,
                    agent._client_log_context(),
                    exc,
                )
                continue
            logger.debug(
                "Codex Responses stream transport failed; falling back to create(stream=True). %s error=%s",
                agent._client_log_context(),
                exc,
            )
            return agent._run_codex_create_stream_fallback(api_kwargs, client=active_client)
        except RuntimeError as exc:
            err_text = str(exc)
            missing_completed = "response.completed" in err_text
            # The OpenAI SDK's Responses streaming state machine raises
            # ``RuntimeError("Expected to have received `response.created`
            # before ``")`` when the first SSE event from the
            # server is anything other than ``response.created`` — and it
            # discards the event's payload before we can read it. Three
            # real-world backends emit a different first frame:
            #
            #   * xAI on grok-4.x OAuth — sends ``error`` (issues
            #     reported around the May 2026 SuperGrok rollout when
            #     multi-turn conversations replay encrypted reasoning
            #     content the OAuth tier rejects)
            #   * codex-lb relays — send ``codex.rate_limits`` (#14634)
            #   * custom Responses relays — send ``response.in_progress``
            #     (#8133)
            #
            # In all three cases the underlying byte stream is still
            # readable: a non-stream ``responses.create(stream=True)``
            # fallback succeeds and surfaces the real provider error as
            # a normal exception with body+status_code attached, which
            # ``_summarize_api_error`` can then translate into a useful
            # user-facing line. Treat ``response.created`` prelude
            # errors the same way we already treat ``response.completed``
            # postlude errors.
            prelude_error = (
                "Expected to have received `response.created`" in err_text
                or "Expected to have received \"response.created\"" in err_text
            )
            if (missing_completed or prelude_error) and attempt < max_stream_retries:
                logger.debug(
                    "Responses stream %s (attempt %s/%s); retrying. %s",
                    "prelude rejected" if prelude_error else "closed before completion",
                    attempt + 1,
                    max_stream_retries + 1,
                    agent._client_log_context(),
                )
                continue
            if missing_completed or prelude_error:
                logger.debug(
                    "Responses stream %s; falling back to create(stream=True). %s err=%s",
                    "rejected before response.created" if prelude_error else "did not emit response.completed",
                    agent._client_log_context(),
                    err_text,
                )
                return agent._run_codex_create_stream_fallback(api_kwargs, client=active_client)
            raise
def _codex_event_field(event: Any, name: str) -> Any:
    """Read ``name`` from a stream event that is either an object or a dict.

    Attribute access is tried first; when that yields nothing and the
    event is a plain dict, the key of the same name is used instead.
    """
    value = getattr(event, name, None)
    if not value and isinstance(event, dict):
        value = event.get(name)
    return value


def run_codex_create_stream_fallback(agent, api_kwargs: dict, client: Any = None):
    """Fallback path for stream completion edge cases on Codex-style Responses backends.

    Issues ``responses.create(stream=True)`` and walks the raw event
    iterator by hand until a terminal event carrying a response arrives.

    Args:
        agent: The parent ``AIAgent``.
        api_kwargs: Request kwargs; copied, never mutated.
        client: Client to use; defaults to the agent's primary OpenAI client.

    Returns:
        The terminal response (its empty ``output`` backfilled from the
        streamed items or text deltas), or whatever ``create`` returned
        when that is already a concrete, non-iterable response.

    Raises:
        run_agent._StreamErrorEvent: When the stream emits an ``error`` event.
        RuntimeError: When the stream ends without a terminal response.
    """
    active_client = client or agent._ensure_primary_openai_client(reason="codex_create_stream_fallback")
    fallback_kwargs = dict(api_kwargs)
    fallback_kwargs["stream"] = True
    fallback_kwargs = agent._get_transport().preflight_kwargs(fallback_kwargs, allow_stream=True)
    stream_or_response = active_client.responses.create(**fallback_kwargs)

    # Compatibility shim for mocks or providers that still return a concrete response.
    if hasattr(stream_or_response, "output") or not hasattr(stream_or_response, "__iter__"):
        return stream_or_response

    collected_output_items: list = []
    collected_text_deltas: list = []
    try:
        for event in stream_or_response:
            agent._touch_activity("receiving stream response")
            event_type = _codex_event_field(event, "type")

            # ``error`` SSE frames carry the provider's real failure
            # reason (subscription / quota / model-not-available /
            # rejected-reasoning-replay) but never appear in the
            # ``{completed, incomplete, failed}`` terminal set, so the
            # raw loop below would silently consume them and end with
            # "did not emit a terminal response". xAI in particular
            # emits ``type=error`` as the FIRST frame for OAuth
            # accounts whose Grok subscription is missing/exhausted —
            # the SDK's stream helper raises ``RuntimeError(Expected
            # to have received response.created before error)`` which
            # the caller catches and routes here, expecting this
            # fallback to surface the message. Synthesize an
            # APIError-shaped exception so ``_summarize_api_error``
            # and the credential-pool entitlement detector see the
            # real text instead of a generic RuntimeError.
            if event_type == "error":
                err_code = _codex_event_field(event, "code")
                err_param = _codex_event_field(event, "param")
                # str(): a non-string message (e.g. a nested object) must
                # not turn the provider's error into an AttributeError.
                err_message = str(
                    _codex_event_field(event, "message") or "stream emitted error event"
                ).strip()
                from run_agent import _StreamErrorEvent
                raise _StreamErrorEvent(err_message, code=err_code, param=err_param)

            # Collect output items and text deltas for backfill
            if event_type == "response.output_item.done":
                done_item = _codex_event_field(event, "item")
                if done_item is not None:
                    collected_output_items.append(done_item)
            elif event_type == "response.output_text.delta":
                delta = _codex_event_field(event, "delta")
                if delta:
                    collected_text_deltas.append(delta)

            if event_type not in {"response.completed", "response.incomplete", "response.failed"}:
                continue

            terminal_response = _codex_event_field(event, "response")
            if terminal_response is None:
                # Terminal event without a payload: keep reading.
                continue

            # Backfill empty output from collected stream events
            _out = getattr(terminal_response, "output", None)
            if isinstance(_out, list) and not _out:
                if collected_output_items:
                    terminal_response.output = list(collected_output_items)
                    logger.debug(
                        "Codex fallback stream: backfilled %d output items",
                        len(collected_output_items),
                    )
                elif collected_text_deltas:
                    assembled = "".join(collected_text_deltas)
                    terminal_response.output = [SimpleNamespace(
                        type="message", role="assistant",
                        status="completed",
                        content=[SimpleNamespace(type="output_text", text=assembled)],
                    )]
                    logger.debug(
                        "Codex fallback stream: synthesized from %d deltas (%d chars)",
                        len(collected_text_deltas), len(assembled),
                    )
            return terminal_response
    finally:
        # Always release the underlying stream, including on early return
        # or when an error event is raised above.
        close_fn = getattr(stream_or_response, "close", None)
        if callable(close_fn):
            try:
                close_fn()
            except Exception:
                pass

    # Reaching here means the iterator was exhausted without a terminal
    # event that carried a response.
    raise RuntimeError("Responses create(stream=True) fallback did not emit a terminal response.")
def check_compression_model_feasibility(agent: Any) -> None:
    """Probe the auxiliary compression model and reconcile it with the threshold.

    Does nothing when ``agent.compression_enabled`` is falsy.  Otherwise it
    resolves the auxiliary "compression" client/model and then:

    * if no client or model is available, stores a warning on
      ``agent._compression_warning``, emits it through ``agent._emit_status``
      and returns;
    * if the auxiliary context window cannot be determined (falsy value),
      skips the comparison entirely instead of comparing against an unknown
      number;
    * if the auxiliary window is below ``MINIMUM_CONTEXT_LENGTH``, raises
      ``ValueError``;
    * if the auxiliary window is below the compressor's current
      ``threshold_tokens``, lowers ``threshold_tokens`` (and
      ``threshold_percent``) on ``agent.context_compressor`` for this session
      and emits/stores a warning describing how to make the change permanent.

    The stored warning is what :func:`replay_compression_warning` later
    re-sends through ``status_callback``.

    Args:
        agent: The owning agent object.  Reads ``compression_enabled``,
            ``context_compressor``, ``_custom_providers``, ``model``,
            ``provider`` and calls ``_current_main_runtime()`` /
            ``_emit_status()``.

    Raises:
        ValueError: The auxiliary compression model's context window is
            smaller than ``MINIMUM_CONTEXT_LENGTH``.  Every other failure is
            logged at debug level and swallowed (the probe is best-effort).
    """
    if not agent.compression_enabled:
        return
    try:
        from agent.auxiliary_client import (
            _resolve_task_provider_model,
            get_text_auxiliary_client,
        )
        from agent.model_metadata import (
            MINIMUM_CONTEXT_LENGTH,
            get_model_context_length,
        )

        client, aux_model = get_text_auxiliary_client(
            "compression",
            main_runtime=agent._current_main_runtime(),
        )
        # Best-effort aux provider label for the warning message.  The
        # configured provider may be "auto", in which case we fall back
        # to the client's base_url hostname so the user can still tell
        # where the compression model is actually being called.
        try:
            _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
        except Exception:
            _aux_cfg_provider = ""
        # Non-empty only when the user pinned a concrete provider
        # (i.e. not empty and not "auto").
        explicit_provider = (
            _aux_cfg_provider
            if _aux_cfg_provider and _aux_cfg_provider != "auto"
            else ""
        )

        if client is None or not aux_model:
            if explicit_provider:
                msg = (
                    "⚠ Configured auxiliary compression provider "
                    f"'{explicit_provider}' is unavailable — context "
                    "compression will drop middle turns without a summary. "
                    "Check auxiliary.compression in config.yaml and "
                    "reauthenticate that provider."
                )
            else:
                msg = (
                    "⚠ No auxiliary LLM provider configured — context "
                    "compression will drop middle turns without a summary. "
                    "Run `hermes setup` or set OPENROUTER_API_KEY."
                )
            agent._compression_warning = msg
            agent._emit_status(msg)
            logger.warning(
                "No auxiliary LLM provider for compression — "
                "summaries will be unavailable."
            )
            return

        aux_base_url = str(getattr(client, "base_url", ""))
        aux_api_key = str(getattr(client, "api_key", ""))

        aux_context = get_model_context_length(
            aux_model,
            base_url=aux_base_url,
            api_key=aux_api_key,
            config_context_length=getattr(agent, "_aux_compression_context_length_config", None),
            # Each model must be resolved with its own provider so that
            # provider-specific paths (e.g. Bedrock static table, OpenRouter API)
            # are invoked for the correct client, not inherited from the main model.
            provider=explicit_provider or getattr(agent, "provider", ""),
            custom_providers=agent._custom_providers,
        )

        # Unknown window (None / 0): there is nothing meaningful to compare.
        # Without this guard a 0 would pass the floor check below and then
        # "auto-lower" the session threshold to 0 tokens.
        if not aux_context:
            logger.debug(
                "Compression feasibility check skipped: context length of "
                "auxiliary model %s is unknown",
                aux_model,
            )
            return

        # Hard floor: the auxiliary compression model must have at least
        # MINIMUM_CONTEXT_LENGTH tokens of context, otherwise it cannot
        # summarise a full threshold-sized window of main-model content.
        if aux_context < MINIMUM_CONTEXT_LENGTH:
            raise ValueError(
                f"Auxiliary compression model {aux_model} has a context "
                f"window of {aux_context:,} tokens, which is below the "
                f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
                f"Agent. Choose a compression model with at least "
                f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
                f"auxiliary.compression.model in config.yaml), or set "
                f"auxiliary.compression.context_length to override the "
                f"detected value if it is wrong."
            )

        threshold = agent.context_compressor.threshold_tokens
        if aux_context < threshold:
            # Auto-correct: lower the live session threshold so
            # compression actually works this session.  The hard floor
            # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH.
            old_threshold = threshold
            new_threshold = aux_context
            agent.context_compressor.threshold_tokens = new_threshold
            # Keep threshold_percent in sync so future main-model
            # context_length changes re-derive from a sensible number
            # rather than the original too-high value.
            main_ctx = agent.context_compressor.context_length
            if main_ctx:
                agent.context_compressor.threshold_percent = (
                    new_threshold / main_ctx
                )
            safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
            # Build human-readable "model (provider)" labels for both
            # sides.  When no explicit aux provider is configured, fall
            # back to the client's base_url hostname.
            _main_model = getattr(agent, "model", "") or "?"
            _main_provider = getattr(agent, "provider", "") or ""
            _aux_provider_label = explicit_provider
            if not _aux_provider_label:
                try:
                    from urllib.parse import urlparse
                    _aux_provider_label = (
                        urlparse(aux_base_url).hostname or aux_base_url
                    )
                except Exception:
                    _aux_provider_label = aux_base_url or "auto"
            _main_label = (
                f"{_main_model} ({_main_provider})"
                if _main_provider
                else _main_model
            )
            _aux_label = f"{aux_model} ({_aux_provider_label})"
            msg = (
                f"⚠ Compression model {_aux_label} context is "
                f"{aux_context:,} tokens, but the main model "
                f"{_main_label}'s compression threshold was "
                f"{old_threshold:,} tokens. "
                f"Auto-lowered this session's threshold to "
                f"{new_threshold:,} tokens so compression can run.\n"
                f" To make this permanent, edit config.yaml — either:\n"
                f" 1. Use a larger compression model:\n"
                f" auxiliary:\n"
                f" compression:\n"
                f" model: \n"
                f" 2. Lower the compression threshold:\n"
                f" compression:\n"
                f" threshold: 0.{safe_pct:02d}"
            )
            agent._compression_warning = msg
            agent._emit_status(msg)
            logger.warning(
                "Auxiliary compression model %s has %d token context, "
                "below the main model's compression threshold of %d "
                "tokens — auto-lowered session threshold to %d to "
                "keep compression working.",
                aux_model,
                aux_context,
                old_threshold,
                new_threshold,
            )
    except ValueError:
        # Hard rejections (aux below minimum context) must propagate
        # so the session refuses to start.
        raise
    except Exception as exc:
        logger.debug(
            "Compression feasibility check failed (non-fatal): %s", exc
        )
def replay_compression_warning(agent: Any) -> None:
    """Forward a stored compression warning to ``agent.status_callback``.

    Reads ``agent._compression_warning`` (absent is treated like ``None``).
    When a warning is stored and ``agent.status_callback`` is truthy, the
    callback is invoked as ``status_callback("lifecycle", warning)``.  Any
    exception raised by the callback is swallowed — delivery is best-effort.
    The stored warning itself is not cleared here.
    """
    warning = getattr(agent, "_compression_warning", None)
    # Nothing stored, or no callback wired up yet: nothing to replay.
    if not warning or not agent.status_callback:
        return
    try:
        agent.status_callback("lifecycle", warning)
    except Exception:
        pass
def compress_context(
    agent: Any,
    messages: list,
    system_message: str,
    *,
    approx_tokens: Optional[int] = None,
    task_id: str = "default",
    focus_topic: Optional[str] = None,
) -> Tuple[list, str]:
    """Compress conversation context and split the session in SQLite.

    Steps, in order: notify the memory manager (``on_pre_compress``), run
    ``agent.context_compressor.compress``, surface summary / aux-model
    failures as warnings, re-append the todo snapshot, rebuild and cache the
    system prompt, rotate the session id in the session DB (when one is
    attached), notify the context engine and memory manager about the
    rotation, refresh the compressor's token estimate, and reset the
    file-read dedup cache for ``task_id``.

    Args:
        agent: The owning agent object.
        messages: Current message history (will be summarised).
        system_message: Current system prompt; rebuilt after compression.
        approx_tokens: Pre-compression token estimate, used for logging and
            passed to the compressor as ``current_tokens``.
        task_id: Tool task scope (used for clearing file-read dedup state).
        focus_topic: Optional focus string for guided compression, passed
            through to the compressor when it accepts the keyword.

    Returns:
        ``(compressed_messages, new_system_prompt)`` tuple.
    """
    _pre_msg_count = len(messages)
    logger.info(
        "context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r",
        agent.session_id or "none", _pre_msg_count,
        f"{approx_tokens:,}" if approx_tokens else "unknown", agent.model,
        focus_topic,
    )
    agent._emit_status(
        "🗜️ Compacting context — summarizing earlier conversation so I can continue..."
    )

    # Notify external memory provider before compression discards context
    if agent._memory_manager:
        try:
            agent._memory_manager.on_pre_compress(messages)
        except Exception:
            pass

    try:
        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic)
    except TypeError:
        # Plugin context engine with strict signature that doesn't accept
        # focus_topic — fall back to calling without it.
        # NOTE(review): this also retries on a TypeError raised from inside
        # compress() itself; confirm that is acceptable for plugin engines.
        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens)

    summary_error = getattr(agent.context_compressor, "_last_summary_error", None)
    if summary_error:
        # Dedup: only warn when the error text changed since the last warning.
        if getattr(agent, "_last_compression_summary_warning", None) != summary_error:
            agent._last_compression_summary_warning = summary_error
            agent._emit_warning(
                f"⚠ Compression summary failed: {summary_error}. "
                "Inserted a fallback context marker."
            )
    else:
        # No hard failure — but did the configured aux model error out
        # and get recovered by retrying on main?  Surface that so users
        # know their auxiliary.compression.model setting is broken even
        # though compression succeeded.
        _aux_fail_model = getattr(agent.context_compressor, "_last_aux_model_failure_model", None)
        _aux_fail_err = getattr(agent.context_compressor, "_last_aux_model_failure_error", None)
        if _aux_fail_model:
            # Dedup on (model, error) so we don't spam on every compaction
            _aux_key = (_aux_fail_model, _aux_fail_err)
            if getattr(agent, "_last_aux_fallback_warning_key", None) != _aux_key:
                agent._last_aux_fallback_warning_key = _aux_key
                agent._emit_warning(
                    f"ℹ Configured compression model '{_aux_fail_model}' failed "
                    f"({_aux_fail_err or 'unknown error'}). Recovered using main model — "
                    "check auxiliary.compression.model in config.yaml."
                )

    todo_snapshot = agent._todo_store.format_for_injection()
    if todo_snapshot:
        compressed.append({"role": "user", "content": todo_snapshot})

    agent._invalidate_system_prompt()
    new_system_prompt = agent._build_system_prompt(system_message)
    agent._cached_system_prompt = new_system_prompt

    # Stays None unless the session id is actually rotated below; the
    # post-rotation notifications key off this instead of probing locals().
    old_session_id: Optional[str] = None

    if agent._session_db:
        try:
            # Propagate title to the new session with auto-numbering
            old_title = agent._session_db.get_session_title(agent.session_id)
            # Trigger memory extraction on the old session before it rotates.
            agent.commit_memory_session(messages)
            agent._session_db.end_session(agent.session_id, "compression")
            old_session_id = agent.session_id
            agent.session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
            os.environ["HERMES_SESSION_ID"] = agent.session_id
            try:
                from gateway.session_context import _SESSION_ID
                _SESSION_ID.set(agent.session_id)
            except Exception:
                pass
            # Update session_log_file to point to the new session's JSON file
            agent.session_log_file = agent.logs_dir / f"session_{agent.session_id}.json"
            agent._session_db_created = False
            agent._session_db.create_session(
                session_id=agent.session_id,
                source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
                model=agent.model,
                model_config=agent._session_init_model_config,
                parent_session_id=old_session_id,
            )
            agent._session_db_created = True
            # Auto-number the title for the continuation session
            if old_title:
                try:
                    new_title = agent._session_db.get_next_title_in_lineage(old_title)
                    agent._session_db.set_session_title(agent.session_id, new_title)
                except Exception as e:
                    logger.debug("Could not propagate title on compression: %s", e)
            agent._session_db.update_system_prompt(agent.session_id, new_system_prompt)
            # Reset flush cursor — new session starts with no messages written
            agent._last_flushed_db_idx = 0
        except Exception as e:
            logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)

    # Notify the context engine that the session_id rotated because of
    # compression (not a fresh /new), passing boundary_reason="compression"
    # and the previous id so engines can carry state across the rollover.
    try:
        if old_session_id and hasattr(agent.context_compressor, "on_session_start"):
            agent.context_compressor.on_session_start(
                agent.session_id or "",
                boundary_reason="compression",
                old_session_id=old_session_id,
            )
    except Exception as _ce_err:
        logger.debug("context engine on_session_start (compression): %s", _ce_err)

    # Notify memory providers of the compression-driven session_id rotation.
    # reset=False because the logical conversation continues; only the id
    # and DB row rolled over.
    try:
        if old_session_id and agent._memory_manager:
            agent._memory_manager.on_session_switch(
                agent.session_id or "",
                parent_session_id=old_session_id,
                reset=False,
                reason="compression",
            )
    except Exception as _me_err:
        logger.debug("memory manager on_session_switch (compression): %s", _me_err)

    # Warn on repeated compressions (quality degrades with each pass)
    _cc = agent.context_compressor.compression_count
    if _cc >= 2:
        agent._vprint(
            f"{agent.log_prefix}⚠️ Session compressed {_cc} times — "
            f"accuracy may degrade. Consider /new to start fresh.",
            force=True,
        )

    # Update token estimate after compaction so pressure calculations
    # use the post-compression count, not the stale pre-compression one.
    # estimate_request_tokens_rough() is given the tool schemas as well so
    # they count toward the estimate.
    _compressed_est = estimate_request_tokens_rough(
        compressed,
        system_prompt=new_system_prompt or "",
        tools=agent.tools or None,
    )
    agent.context_compressor.last_prompt_tokens = _compressed_est
    agent.context_compressor.last_completion_tokens = 0

    # Clear the file-read dedup cache.  After compression the original
    # read content is summarised away — if the model re-reads the same
    # file it needs the full content, not a "file unchanged" stub.
    try:
        from tools.file_tools import reset_file_dedup
        reset_file_dedup(task_id)
    except Exception:
        pass

    logger.info(
        "context compression done: session=%s messages=%d->%d tokens=~%s",
        agent.session_id or "none", _pre_msg_count, len(compressed),
        f"{_compressed_est:,}",
    )
    return compressed, new_system_prompt
def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
    """Re-encode oversized inline images so a rejected request can be retried.

    Walks every dict message whose ``content`` is a list and looks at parts of
    type ``image_url`` / ``input_image``.  The part's ``image_url`` value may
    be either ``{"url": "data:..."}`` or a bare ``"data:..."`` string.  Data
    URLs longer than the 4 MB target are decoded to a temporary file and
    passed to ``vision_tools._resize_image_for_vision``; when that yields a
    strictly shorter URL, it replaces the original in place.

    Non-``data:`` URLs (e.g. http/https) are never touched.

    Args:
        api_messages: Message list to mutate in place.  Falsy input is
            accepted and yields ``False``.

    Returns:
        True if at least one image part was replaced; False when there was
        nothing to shrink, the resize helper could not be imported, or no
        re-encode produced a smaller result.
    """
    if not api_messages:
        return False

    try:
        from tools.vision_tools import _resize_image_for_vision
    except Exception as exc:
        logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc)
        return False

    # 4 MB target leaves headroom under a 5 MB provider ceiling.  Shrinking
    # loses quality, but this only runs after a provider rejection, so the
    # alternative is failure.
    target_bytes = 4 * 1024 * 1024
    changed_count = 0

    def _shrink_data_url(url: str) -> Optional[str]:
        """Return a smaller data URL, or None if shrink can't help."""
        if not isinstance(url, str) or not url.startswith("data:"):
            return None
        if len(url) <= target_bytes:
            # This specific image wasn't the oversized one.
            return None
        try:
            header, _, data = url.partition(",")
            # ``header`` always starts with "data:" here (checked above);
            # take the media type that follows, defaulting to JPEG when it
            # is missing or not an image type.
            mime = "image/jpeg"
            mime_part = header[len("data:"):].split(";", 1)[0].strip()
            if mime_part.startswith("image/"):
                mime = mime_part
            import base64 as _b64
            raw = _b64.b64decode(data)
            suffix = {
                "image/png": ".png", "image/gif": ".gif", "image/webp": ".webp",
                "image/jpeg": ".jpg", "image/jpg": ".jpg", "image/bmp": ".bmp",
            }.get(mime, ".jpg")
            tmp_path: Optional[Path] = None
            try:
                # ``with`` guarantees the handle is closed even when the
                # write fails; the file itself is removed in ``finally``.
                with tempfile.NamedTemporaryFile(
                    prefix="hermes_shrink_", suffix=suffix, delete=False,
                ) as tmp:
                    tmp_path = Path(tmp.name)
                    tmp.write(raw)
                resized = _resize_image_for_vision(
                    tmp_path,
                    mime_type=mime,
                    max_base64_bytes=target_bytes,
                )
            finally:
                if tmp_path is not None:
                    try:
                        tmp_path.unlink(missing_ok=True)
                    except Exception:
                        pass
            if not resized or len(resized) >= len(url):
                # Shrink didn't help (or made it bigger — corrupt input?).
                return None
            return resized
        except Exception as exc:
            logger.warning("image-shrink recovery: re-encode failed — %s", exc)
            return None

    for msg in api_messages:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        for part in content:
            if not isinstance(part, dict):
                continue
            if part.get("type") not in {"image_url", "input_image"}:
                continue
            image_value = part.get("image_url")
            # Nested form:  {"image_url": {"url": "data:..."}}
            # Flat form:    {"image_url": "data:..."}
            if isinstance(image_value, dict):
                resized = _shrink_data_url(image_value.get("url", ""))
                if resized:
                    image_value["url"] = resized
                    changed_count += 1
            elif isinstance(image_value, str):
                resized = _shrink_data_url(image_value)
                if resized:
                    part["image_url"] = resized
                    changed_count += 1

    if changed_count:
        logger.info(
            "image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB",
            changed_count, target_bytes / (1024 * 1024),
        )
    return changed_count > 0
+ +Symbols that production code or tests patch on ``run_agent`` directly +(``handle_function_call``, ``_set_interrupt``, ``OpenAI``, ...) are +resolved through :func:`_ra` so those patches keep working. +""" + +from __future__ import annotations + +import json +import logging +import os +import random +import re +import ssl +import threading +import time +import uuid +from typing import Any, Dict, List, Optional + +from agent.anthropic_adapter import _is_oauth_token +from agent.auxiliary_client import set_runtime_main +from agent.codex_responses_adapter import _summarize_user_message_for_log +from agent.display import KawaiiSpinner +from agent.error_classifier import FailoverReason, classify_api_error +from agent.iteration_budget import IterationBudget +from agent.memory_manager import build_memory_context_block +from agent.message_sanitization import ( + _repair_tool_call_arguments, + _sanitize_messages_non_ascii, + _sanitize_messages_surrogates, + _sanitize_structure_non_ascii, + _sanitize_structure_surrogates, + _sanitize_surrogates, + _sanitize_tools_non_ascii, + _strip_images_from_messages, + _strip_non_ascii, +) +from agent.model_metadata import ( + estimate_messages_tokens_rough, + estimate_request_tokens_rough, + get_next_probe_tier, + parse_available_output_tokens_from_error, + parse_context_limit_from_error, + save_context_length, +) +from agent.nous_rate_guard import ( + clear_nous_rate_limit, + is_genuine_nous_rate_limit, + nous_rate_limit_remaining, + record_nous_rate_limit, +) +from agent.process_bootstrap import _install_safe_stdio +from agent.prompt_caching import apply_anthropic_cache_control +from agent.retry_utils import jittered_backoff +from agent.trajectory import has_incomplete_scratchpad +from agent.usage_pricing import estimate_usage_cost, normalize_usage +from hermes_constants import display_hermes_home as _dhh_fn +from hermes_logging import set_session_context +from tools.schema_sanitizer import strip_pattern_and_format +from 
def _ra():
    """Return the ``run_agent`` module, importing it at call time.

    The import is deferred to each call so that attributes patched on
    ``run_agent`` (``run_agent.handle_function_call``,
    ``run_agent._set_interrupt``, ``run_agent.OpenAI``) are what this code
    path sees when it looks them up on the returned module.
    """
    import run_agent as _run_agent_module

    return _run_agent_module
Used by tools whose behaviour depends on the active + # main model (e.g. vision_analyze's native fast path) so they see + # the CLI/gateway override instead of the stale config.yaml + # default. Idempotent — fine to call every turn. + try: + from agent.auxiliary_client import set_runtime_main + set_runtime_main( + getattr(agent, "provider", "") or "", + getattr(agent, "model", "") or "", + ) + except Exception: + pass + + # Tag all log records on this thread with the session ID so + # ``hermes logs --session `` can filter a single conversation. + from hermes_logging import set_session_context + set_session_context(agent.session_id) + + # Bind the skill write-origin ContextVar for this thread so tool + # handlers (e.g. skill_manage create) can tell whether they are + # running inside the background agent-improvement review fork vs. + # a foreground user-directed turn. Set at the top of each call; + # the review fork runs on its own thread with a fresh context, + # so the foreground value here does not leak into it. + from tools.skill_provenance import set_current_write_origin + set_current_write_origin(getattr(agent, "_memory_write_origin", "assistant_tool")) + + # If the previous turn activated fallback, restore the primary + # runtime so this turn gets a fresh attempt with the preferred model. + # No-op when _fallback_activated is False (gateway, first turn, etc.). + agent._restore_primary_runtime() + + # Sanitize surrogate characters from user input. Clipboard paste from + # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates + # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK. 
+ if isinstance(user_message, str): + user_message = _sanitize_surrogates(user_message) + if isinstance(persist_user_message, str): + persist_user_message = _sanitize_surrogates(persist_user_message) + + # Store stream callback for _interruptible_api_call to pick up + agent._stream_callback = stream_callback + agent._persist_user_message_idx = None + agent._persist_user_message_override = persist_user_message + # Generate unique task_id if not provided to isolate VMs between concurrent tasks + effective_task_id = task_id or str(uuid.uuid4()) + # Expose the active task_id so tools running mid-turn (e.g. delegate_task + # in delegate_tool.py) can identify this agent for the cross-agent file + # state registry. Set BEFORE any tool dispatch so snapshots taken at + # child-launch time see the parent's real id, not None. + agent._current_task_id = effective_task_id + + # Reset retry counters and iteration budget at the start of each turn + # so subagent usage from a previous turn doesn't eat into the next one. + agent._invalid_tool_retries = 0 + agent._invalid_json_retries = 0 + agent._empty_content_retries = 0 + agent._incomplete_scratchpad_retries = 0 + agent._codex_incomplete_retries = 0 + agent._thinking_prefill_retries = 0 + agent._post_tool_empty_retried = False + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._mute_post_response = False + agent._unicode_sanitization_passes = 0 + agent._tool_guardrails.reset_for_turn() + agent._tool_guardrail_halt_decision = None + # True until the server rejects an image_url content part with an error + # like "Only 'text' content type is supported." Set to False on first + # rejection and kept False for the rest of the session so we never re-send + # images to a text-only endpoint. Scoped per `_run()` call, not per instance. 
+ agent._vision_supported = True + + # Pre-turn connection health check: detect and clean up dead TCP + # connections left over from provider outages or dropped streams. + # This prevents the next API call from hanging on a zombie socket. + if agent.api_mode != "anthropic_messages": + try: + if agent._cleanup_dead_connections(): + agent._emit_status( + "🔌 Detected stale connections from a previous provider " + "issue — cleaned up automatically. Proceeding with fresh " + "connection." + ) + except Exception: + pass + # Replay compression warning through status_callback for gateway + # platforms (the callback was not wired during __init__). + if agent._compression_warning: + agent._replay_compression_warning() + agent._compression_warning = None # send once + + # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here. + # They are initialized in __init__ and must persist across run_conversation + # calls so that nudge logic accumulates correctly in CLI mode. + agent.iteration_budget = IterationBudget(agent.max_iterations) + + # Log conversation turn start for debugging/observability + _preview_text = _summarize_user_message_for_log(user_message) + _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text + _msg_preview = _msg_preview.replace("\n", " ") + logger.info( + "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r", + agent.session_id or "none", agent.model, agent.provider or "unknown", + agent.platform or "unknown", len(conversation_history or []), + _msg_preview, + ) + + # Initialize conversation (copy to avoid mutating the caller's list) + messages = list(conversation_history) if conversation_history else [] + + # Hydrate todo store from conversation history (gateway creates a fresh + # AIAgent per message, so the in-memory store is empty -- we need to + # recover the todo state from the most recent todo tool response in history) + if conversation_history and not 
agent._todo_store.has_items(): + agent._hydrate_todo_store(conversation_history) + + # Hydrate per-session nudge counters from persisted history. + # Gateway creates a fresh AIAgent per inbound message (cache miss / + # 1h idle eviction / config-signature mismatch / process restart), so + # _turns_since_memory and _user_turn_count start at 0 every turn and + # the memory.nudge_interval trigger may never be reached. Reconstruct + # an effective count from prior user turns in conversation_history. + # Idempotent: a cached agent that already accumulated counters keeps + # them; only a freshly-built agent with empty in-memory state hydrates. + # See issue #22357. + if conversation_history and agent._user_turn_count == 0: + prior_user_turns = sum( + 1 for m in conversation_history if m.get("role") == "user" + ) + if prior_user_turns > 0: + agent._user_turn_count = prior_user_turns + if agent._memory_nudge_interval > 0 and agent._turns_since_memory == 0: + # % preserves original 1-in-N cadence rather than firing a + # review immediately on resume (which would surprise users + # whose session happened to land just past a multiple of N). + agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval + + + # Prefill messages (few-shot priming) are injected at API-call time only, + # never stored in the messages list. This keeps them ephemeral: they won't + # be saved to session DB, session logs, or batch trajectories, but they're + # automatically re-applied on every API call (including session continuations). + + # Track user turns for memory flush and periodic nudge logic + agent._user_turn_count += 1 + + # Reset the streaming context scrubber at the top of each turn so a + # hung span from a prior interrupted stream can't taint this turn's + # output. 
+ scrubber = getattr(agent, "_stream_context_scrubber", None) + if scrubber is not None: + scrubber.reset() + # Reset the think scrubber for the same reason — an interrupted + # prior stream may have left us inside an unterminated block. + think_scrubber = getattr(agent, "_stream_think_scrubber", None) + if think_scrubber is not None: + think_scrubber.reset() + + # Preserve the original user message (no nudge injection). + original_user_message = persist_user_message if persist_user_message is not None else user_message + + # Track memory nudge trigger (turn-based, checked here). + # Skill trigger is checked AFTER the agent loop completes, based on + # how many tool iterations THIS turn used. + _should_review_memory = False + if (agent._memory_nudge_interval > 0 + and "memory" in agent.valid_tool_names + and agent._memory_store): + agent._turns_since_memory += 1 + if agent._turns_since_memory >= agent._memory_nudge_interval: + _should_review_memory = True + agent._turns_since_memory = 0 + + # Add user message + user_msg = {"role": "user", "content": user_message} + messages.append(user_msg) + current_turn_user_idx = len(messages) - 1 + agent._persist_user_message_idx = current_turn_user_idx + + if not agent.quiet_mode: + _print_preview = _summarize_user_message_for_log(user_message) + agent._safe_print(f"💬 Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'") + + # ── System prompt (cached per session for prefix caching) ── + # Built once on first call, reused for all subsequent calls. + # Only rebuilt after context compression events (which invalidate + # the cache and reload memory from disk). + # + # For continuing sessions (gateway creates a fresh AIAgent per + # message), we load the stored system prompt from the session DB + # instead of rebuilding. 
Rebuilding would pick up memory changes + # from disk that the model already knows about (it wrote them!), + # producing a different system prompt and breaking the Anthropic + # prefix cache. + if agent._cached_system_prompt is None: + stored_prompt = None + if conversation_history and agent._session_db: + try: + session_row = agent._session_db.get_session(agent.session_id) + if session_row: + stored_prompt = session_row.get("system_prompt") or None + except Exception: + pass # Fall through to build fresh + + if stored_prompt: + # Continuing session — reuse the exact system prompt from + # the previous turn so the Anthropic cache prefix matches. + agent._cached_system_prompt = stored_prompt + else: + # First turn of a new session — build from scratch. + agent._cached_system_prompt = agent._build_system_prompt(system_message) + # Plugin hook: on_session_start + # Fired once when a brand-new session is created (not on + # continuation). Plugins can use this to initialise + # session-scoped state (e.g. warm a memory cache). + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "on_session_start", + session_id=agent.session_id, + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + except Exception as exc: + logger.warning("on_session_start hook failed: %s", exc) + + # Store the system prompt snapshot in SQLite + if agent._session_db: + try: + agent._session_db.update_system_prompt(agent.session_id, agent._cached_system_prompt) + except Exception as e: + logger.debug("Session DB update_system_prompt failed: %s", e) + + active_system_prompt = agent._cached_system_prompt + + # ── Preflight context compression ── + # Before entering the main loop, check if the loaded conversation + # history already exceeds the model's context threshold. 
This handles + # cases where a user switches to a model with a smaller context window + # while having a large existing session — compress proactively rather + # than waiting for an API error (which might be caught as a non-retryable + # 4xx and abort the request entirely). + if ( + agent.compression_enabled + and len(messages) > agent.context_compressor.protect_first_n + + agent.context_compressor.protect_last_n + 1 + ): + # Include tool schema tokens — with many tools these can add + # 20-30K+ tokens that the old sys+msg estimate missed entirely. + _preflight_tokens = estimate_request_tokens_rough( + messages, + system_prompt=active_system_prompt or "", + tools=agent.tools or None, + ) + + if _preflight_tokens >= agent.context_compressor.threshold_tokens: + logger.info( + "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", + f"{_preflight_tokens:,}", + f"{agent.context_compressor.threshold_tokens:,}", + agent.model, + f"{agent.context_compressor.context_length:,}", + ) + agent._emit_status( + f"📦 Preflight compression: ~{_preflight_tokens:,} tokens " + f">= {agent.context_compressor.threshold_tokens:,} threshold. " + "This may take a moment." + ) + # May need multiple passes for very large sessions with small + # context windows (each pass summarises the middle N turns). + for _pass in range(3): + _orig_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=_preflight_tokens, + task_id=effective_task_id, + ) + if len(messages) >= _orig_len: + break # Cannot compress further + # Compression created a new session — clear the history + # reference so _flush_messages_to_session_db writes ALL + # compressed messages to the new session's SQLite, not + # skipping them because conversation_history is still the + # pre-compression length. + conversation_history = None + # Fix: reset retry counters after compression so the model + # gets a fresh budget on the compressed context. 
Without + # this, pre-compression retries carry over and the model + # hits "(empty)" immediately after compression-induced + # context loss. + agent._empty_content_retries = 0 + agent._thinking_prefill_retries = 0 + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._mute_post_response = False + # Re-estimate after compression + _preflight_tokens = estimate_request_tokens_rough( + messages, + system_prompt=active_system_prompt or "", + tools=agent.tools or None, + ) + if _preflight_tokens < agent.context_compressor.threshold_tokens: + break # Under threshold + + # Plugin hook: pre_llm_call + # Fired once per turn before the tool-calling loop. Plugins can + # return a dict with a ``context`` key (or a plain string) whose + # value is appended to the current turn's user message. + # + # Context is ALWAYS injected into the user message, never the + # system prompt. This preserves the prompt cache prefix — the + # system prompt stays identical across turns so cached tokens + # are reused. The system prompt is Hermes's territory; plugins + # contribute context alongside the user's input. + # + # All injected context is ephemeral (not persisted to session DB). 
+ _plugin_user_context = "" + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _pre_results = _invoke_hook( + "pre_llm_call", + session_id=agent.session_id, + user_message=original_user_message, + conversation_history=list(messages), + is_first_turn=(not bool(conversation_history)), + model=agent.model, + platform=getattr(agent, "platform", None) or "", + sender_id=getattr(agent, "_user_id", None) or "", + ) + _ctx_parts: list[str] = [] + for r in _pre_results: + if isinstance(r, dict) and r.get("context"): + _ctx_parts.append(str(r["context"])) + elif isinstance(r, str) and r.strip(): + _ctx_parts.append(r) + if _ctx_parts: + _plugin_user_context = "\n\n".join(_ctx_parts) + except Exception as exc: + logger.warning("pre_llm_call hook failed: %s", exc) + + # Main conversation loop + api_call_count = 0 + final_response = None + interrupted = False + codex_ack_continuations = 0 + length_continue_retries = 0 + truncated_tool_call_retries = 0 + truncated_response_parts: List[str] = [] + compression_attempts = 0 + _turn_exit_reason = "unknown" # Diagnostic: why the loop ended + + # Per-turn file-mutation verifier state. Keyed by resolved path; + # each failed ``write_file`` / ``patch`` call records the error + # preview. Later successful writes to the same path remove the + # entry (the model recovered). At end-of-turn, any entries still + # present are surfaced in an advisory footer so the model cannot + # over-claim success while the file is actually unchanged on disk. + agent._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {} + + # Record the execution thread so interrupt()/clear_interrupt() can + # scope the tool-level interrupt signal to THIS agent's thread only. + # Must be set before any thread-scoped interrupt syncing. + agent._execution_thread_id = threading.current_thread().ident + + # Always clear stale per-thread state from a previous turn. 
If an + # interrupt arrived before startup finished, preserve it and bind it + # to this execution thread now instead of dropping it on the floor. + _ra()._set_interrupt(False, agent._execution_thread_id) + if agent._interrupt_requested: + _ra()._set_interrupt(True, agent._execution_thread_id) + agent._interrupt_thread_signal_pending = False + else: + agent._interrupt_message = None + agent._interrupt_thread_signal_pending = False + + # Notify memory providers of the new turn so cadence tracking works. + # Must happen BEFORE prefetch_all() so providers know which turn it is + # and can gate context/dialectic refresh via contextCadence/dialecticCadence. + if agent._memory_manager: + try: + _turn_msg = original_user_message if isinstance(original_user_message, str) else "" + agent._memory_manager.on_turn_start(agent._user_turn_count, _turn_msg) + except Exception: + pass + + # External memory provider: prefetch once before the tool loop. + # Reuse the cached result on every iteration to avoid re-calling + # prefetch_all() on each tool call (10 tool calls = 10x latency + cost). + # Use original_user_message (clean input) — user_message may contain + # injected skill content that bloats / breaks provider queries. + _ext_prefetch_cache = "" + if agent._memory_manager: + try: + _query = original_user_message if isinstance(original_user_message, str) else "" + _ext_prefetch_cache = agent._memory_manager.prefetch_all(_query) or "" + except Exception: + pass + + # Optional opt-in runtime: if api_mode == codex_app_server, hand the + # turn to the codex app-server subprocess (terminal/file ops/patching + # all run inside Codex). Default Hermes path is bypassed entirely. + # See agent/transports/codex_app_server_session.py for the adapter + # and references/codex-app-server-runtime.md for the rationale. 
+ if agent.api_mode == "codex_app_server": + return agent._run_codex_app_server_turn( + user_message=user_message, + original_user_message=original_user_message, + messages=messages, + effective_task_id=effective_task_id, + should_review_memory=_should_review_memory, + ) + + while (api_call_count < agent.max_iterations and agent.iteration_budget.remaining > 0) or agent._budget_grace_call: + # Reset per-turn checkpoint dedup so each iteration can take one snapshot + agent._checkpoint_mgr.new_turn() + + # Check for interrupt request (e.g., user sent new message) + if agent._interrupt_requested: + interrupted = True + _turn_exit_reason = "interrupted_by_user" + if not agent.quiet_mode: + agent._safe_print("\n⚡ Breaking out of tool loop due to interrupt...") + break + + api_call_count += 1 + agent._api_call_count = api_call_count + agent._touch_activity(f"starting API call #{api_call_count}") + + # Grace call: the budget is exhausted but we gave the model one + # more chance. Consume the grace flag so the loop exits after + # this iteration regardless of outcome. 
+ if agent._budget_grace_call: + agent._budget_grace_call = False + elif not agent.iteration_budget.consume(): + _turn_exit_reason = "budget_exhausted" + if not agent.quiet_mode: + agent._safe_print(f"\n⚠️ Iteration budget exhausted ({agent.iteration_budget.used}/{agent.iteration_budget.max_total} iterations used)") + break + + # Fire step_callback for gateway hooks (agent:step event) + if agent.step_callback is not None: + try: + prev_tools = [] + for _idx, _m in enumerate(reversed(messages)): + if _m.get("role") == "assistant" and _m.get("tool_calls"): + _fwd_start = len(messages) - _idx + _results_by_id = {} + for _tm in messages[_fwd_start:]: + if _tm.get("role") != "tool": + break + _tcid = _tm.get("tool_call_id") + if _tcid: + _results_by_id[_tcid] = _tm.get("content", "") + prev_tools = [ + { + "name": tc["function"]["name"], + "result": _results_by_id.get(tc.get("id")), + "arguments": tc["function"].get("arguments"), + } + for tc in _m["tool_calls"] + if isinstance(tc, dict) + ] + break + agent.step_callback(api_call_count, prev_tools) + except Exception as _step_err: + logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err) + + # Track tool-calling iterations for skill nudge. + # Counter resets whenever skill_manage is actually used. + if (agent._skill_nudge_interval > 0 + and "skill_manage" in agent.valid_tool_names): + agent._iters_since_skill += 1 + + # ── Pre-API-call /steer drain ────────────────────────────────── + # If a /steer arrived during the previous API call (while the model + # was thinking), drain it now — before we build api_messages — so + # the model sees the steer text on THIS iteration. Without this, + # steers sent during an API call only land after the NEXT tool batch, + # which may never come if the model returns a final response. + # + # We scan backwards for the last tool-role message in the messages + # list. If found, the steer is appended there. 
If not (first + # iteration, no tools yet), the steer stays pending for the next + # tool batch — injecting into a user message would break role + # alternation, and there's no tool output to piggyback on. + _pre_api_steer = agent._drain_pending_steer() + if _pre_api_steer: + _injected = False + for _si in range(len(messages) - 1, -1, -1): + _sm = messages[_si] + if isinstance(_sm, dict) and _sm.get("role") == "tool": + marker = f"\n\nUser guidance: {_pre_api_steer}" + existing = _sm.get("content", "") + if isinstance(existing, str): + _sm["content"] = existing + marker + else: + # Multimodal content blocks — append text block + try: + blocks = list(existing) if existing else [] + blocks.append({"type": "text", "text": marker}) + _sm["content"] = blocks + except Exception: + pass + _injected = True + logger.debug( + "Pre-API-call steer drain: injected into tool msg at index %d", + _si, + ) + break + if not _injected: + # No tool message to inject into — put it back so + # the post-tool-execution drain picks it up later. + _lock = getattr(agent, "_pending_steer_lock", None) + if _lock is not None: + with _lock: + if agent._pending_steer: + agent._pending_steer = agent._pending_steer + "\n" + _pre_api_steer + else: + agent._pending_steer = _pre_api_steer + else: + existing = getattr(agent, "_pending_steer", None) + agent._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer + + # Prepare messages for API call + # If we have an ephemeral system prompt, prepend it to the messages + # Note: Reasoning is embedded in content via tags for trajectory storage. + # However, providers like Moonshot AI require a separate 'reasoning_content' field + # on assistant messages with tool_calls. We handle both cases here. 
+ request_logger = getattr(agent, "logger", None) or logging.getLogger(__name__) + repaired_tool_calls = agent._sanitize_tool_call_arguments( + messages, + logger=request_logger, + session_id=agent.session_id, + ) + if repaired_tool_calls > 0: + request_logger.info( + "Sanitized %s corrupted tool_call arguments before request (session=%s)", + repaired_tool_calls, + agent.session_id or "-", + ) + + # Defensive: repair malformed role-alternation before API call. + # Catches cases where the history got wedged into a + # ``tool → user`` or ``user → user`` tail (e.g. after empty- + # response scaffolding was stripped and a new user message + # landed after an orphan tool result). Most providers return + # empty content on malformed sequences, which would otherwise + # retrigger the empty-retry loop indefinitely. + repaired_seq = agent._repair_message_sequence(messages) + if repaired_seq > 0: + request_logger.info( + "Repaired %s message-alternation violations before request (session=%s)", + repaired_seq, + agent.session_id or "-", + ) + + api_messages = [] + for idx, msg in enumerate(messages): + api_msg = msg.copy() + + # Inject ephemeral context into the current turn's user message. + # Sources: memory manager prefetch + plugin pre_llm_call hooks + # with target="user_message" (the default). Both are + # API-call-time only — the original message in `messages` is + # never mutated, so nothing leaks into session persistence. 
+ if idx == current_turn_user_idx and msg.get("role") == "user": + _injections = [] + if _ext_prefetch_cache: + _fenced = build_memory_context_block(_ext_prefetch_cache) + if _fenced: + _injections.append(_fenced) + if _plugin_user_context: + _injections.append(_plugin_user_context) + if _injections: + _base = api_msg.get("content", "") + if isinstance(_base, str): + api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections) + + # For ALL assistant messages, pass reasoning back to the API + # This ensures multi-turn reasoning context is preserved + agent._copy_reasoning_content_for_api(msg, api_msg) + + # Remove 'reasoning' field - it's for trajectory storage only + # We've copied it to 'reasoning_content' for the API above + if "reasoning" in api_msg: + api_msg.pop("reasoning") + # Remove finish_reason - not accepted by strict APIs (e.g. Mistral) + if "finish_reason" in api_msg: + api_msg.pop("finish_reason") + # Strip internal thinking-prefill marker + api_msg.pop("_thinking_prefill", None) + # Strip Codex Responses API fields (call_id, response_item_id) for + # strict providers like Mistral, Fireworks, etc. that reject unknown fields. + # Uses new dicts so the internal messages list retains the fields + # for Codex Responses compatibility. + if agent._should_sanitize_tool_calls(): + agent._sanitize_tool_calls_for_strict_api(api_msg) + # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context + # The signature field helps maintain reasoning continuity + api_messages.append(api_msg) + + # Build the final system message: cached prompt + ephemeral system prompt. + # Ephemeral additions are API-call-time only (not persisted to session DB). + # External recall context is injected into the user message, not the system + # prompt, so the stable cache prefix remains unchanged. + # + # NOTE: Plugin context from pre_llm_call hooks is injected into the + # user message (see injection block above), NOT the system prompt. 
+ # This is intentional — system prompt modifications break the prompt + # cache prefix. The system prompt is reserved for Hermes internals. + # + # Hermes invariant: the system prompt is built ONCE per session + # (cached on ``_cached_system_prompt``) and replayed verbatim on + # every turn. We send it as a single content string so the + # bytes are byte-stable across turns and upstream prompt caches + # stay warm. + effective_system = active_system_prompt or "" + if agent.ephemeral_system_prompt: + effective_system = (effective_system + "\n\n" + agent.ephemeral_system_prompt).strip() + if effective_system: + api_messages = [{"role": "system", "content": effective_system}] + api_messages + + # Inject ephemeral prefill messages right after the system prompt + # but before conversation history. Same API-call-time-only pattern. + if agent.prefill_messages: + sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0 + for idx, pfm in enumerate(agent.prefill_messages): + api_messages.insert(sys_offset + idx, pfm.copy()) + + # Apply Anthropic prompt caching for Claude models on native + # Anthropic, OpenRouter, and third-party Anthropic-compatible + # gateways. Auto-detected: if ``_use_prompt_caching`` is set, + # inject cache_control breakpoints (system + last 3 messages) + # to reduce input token costs by ~75% on multi-turn + # conversations. + if agent._use_prompt_caching: + api_messages = apply_anthropic_cache_control( + api_messages, + cache_ttl=agent._cache_ttl, + native_anthropic=agent._use_native_cache_layout, + ) + + # Safety net: strip orphaned tool results / add stubs for missing + # results before sending to the API. Runs unconditionally — not + # gated on context_compressor — so orphans from session loading or + # manual message manipulation are always caught. 
+ api_messages = agent._sanitize_api_messages(api_messages) + + # Drop thinking-only assistant turns (reasoning but no visible + # output and no tool_calls) and merge any adjacent user messages + # left behind. Prevents Anthropic 400s ("The final block in an + # assistant message cannot be `thinking`.") and equivalent errors + # from third-party Anthropic-compatible gateways that can't replay + # a thinking-only turn. Runs on the per-call copy only — the + # stored conversation history keeps the reasoning block for the + # UI transcript and session persistence. + api_messages = agent._drop_thinking_only_and_merge_users(api_messages) + + # Normalize message whitespace and tool-call JSON for consistent + # prefix matching. Ensures bit-perfect prefixes across turns, + # which enables KV cache reuse on local inference servers + # (llama.cpp, vLLM, Ollama) and improves cache hit rates for + # cloud providers. Operates on api_messages (the API copy) so + # the original conversation history in `messages` is untouched. + for am in api_messages: + if isinstance(am.get("content"), str): + am["content"] = am["content"].strip() + for am in api_messages: + tcs = am.get("tool_calls") + if not tcs: + continue + new_tcs = [] + for tc in tcs: + if isinstance(tc, dict) and "function" in tc: + try: + args_obj = json.loads(tc["function"]["arguments"]) + tc = {**tc, "function": { + **tc["function"], + "arguments": json.dumps( + args_obj, separators=(",", ":"), + sort_keys=True, + ), + }} + except Exception: + tc["function"]["arguments"] = _repair_tool_call_arguments( + tc["function"]["arguments"], + tc["function"].get("name", "?"), + ) + new_tcs.append(tc) + am["tool_calls"] = new_tcs + + # Proactively strip any surrogate characters before the API call. + # Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return + # lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside + # the OpenAI SDK. Sanitizing here prevents the 3-retry cycle. 
+ _sanitize_messages_surrogates(api_messages) + + # Calculate approximate request size for logging + total_chars = sum(len(str(msg)) for msg in api_messages) + approx_tokens = estimate_messages_tokens_rough(api_messages) + + # Thinking spinner for quiet mode (animated during API call) + thinking_spinner = None + + if not agent.quiet_mode: + agent._vprint(f"\n{agent.log_prefix}🔄 Making API call #{api_call_count}/{agent.max_iterations}...") + agent._vprint(f"{agent.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") + agent._vprint(f"{agent.log_prefix} 🔧 Available tools: {len(agent.tools) if agent.tools else 0}") + else: + # Animated thinking spinner in quiet mode + face = random.choice(KawaiiSpinner.get_thinking_faces()) + verb = random.choice(KawaiiSpinner.get_thinking_verbs()) + if agent.thinking_callback: + # CLI TUI mode: use prompt_toolkit widget instead of raw spinner + # (works in both streaming and non-streaming modes) + agent.thinking_callback(f"{face} {verb}...") + elif not agent._has_stream_consumers() and agent._should_start_quiet_spinner(): + # Raw KawaiiSpinner only when no streaming consumers and the + # spinner output has a safe sink. 
+ spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star']) + thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=agent._print_fn) + thinking_spinner.start() + + # Log request details if verbose + if agent.verbose_logging: + logging.debug(f"API Request - Model: {agent.model}, Messages: {len(messages)}, Tools: {len(agent.tools) if agent.tools else 0}") + logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") + logging.debug(f"Total message size: ~{approx_tokens:,} tokens") + + api_start_time = time.time() + retry_count = 0 + max_retries = agent._api_max_retries + primary_recovery_attempted = False + max_compression_attempts = 3 + codex_auth_retry_attempted=False + anthropic_auth_retry_attempted=False + nous_auth_retry_attempted=False + copilot_auth_retry_attempted=False + thinking_sig_retry_attempted = False + image_shrink_retry_attempted = False + oauth_1m_beta_retry_attempted = False + llama_cpp_grammar_retry_attempted = False + has_retried_429 = False + restart_with_compressed_messages = False + restart_with_length_continuation = False + + finish_reason = "stop" + response = None # Guard against UnboundLocalError if all retries fail + api_kwargs = None # Guard against UnboundLocalError in except handler + + while retry_count < max_retries: + # ── Nous Portal rate limit guard ────────────────────── + # If another session already recorded that Nous is rate- + # limited, skip the API call entirely. Each attempt + # (including SDK-level retries) counts against RPH and + # deepens the rate limit hole. + if agent.provider == "nous": + try: + from agent.nous_rate_guard import ( + nous_rate_limit_remaining, + format_remaining as _fmt_nous_remaining, + ) + _nous_remaining = nous_rate_limit_remaining() + if _nous_remaining is not None and _nous_remaining > 0: + _nous_msg = ( + f"Nous Portal rate limit active — " + f"resets in {_fmt_nous_remaining(_nous_remaining)}." 
+ ) + agent._vprint( + f"{agent.log_prefix}⏳ {_nous_msg} Trying fallback...", + force=True, + ) + agent._emit_status(f"⏳ {_nous_msg}") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + # No fallback available — return with clear message + agent._persist_session(messages, conversation_history) + return { + "final_response": ( + f"⏳ {_nous_msg}\n\n" + "No fallback provider available. " + "Try again after the reset, or add a " + "fallback provider in config.yaml." + ), + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": _nous_msg, + } + except ImportError: + pass + except Exception: + pass # Never let rate guard break the agent loop + + try: + agent._reset_stream_delivery_tracking() + api_kwargs = agent._build_api_kwargs(api_messages) + if agent._force_ascii_payload: + _sanitize_structure_non_ascii(api_kwargs) + if agent.api_mode == "codex_responses": + api_kwargs = agent._get_transport().preflight_kwargs(api_kwargs, allow_stream=False) + + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + request_messages = api_kwargs.get("messages") + if not isinstance(request_messages, list): + request_messages = api_kwargs.get("input") + if not isinstance(request_messages, list): + request_messages = api_messages + # Shallow-copy the outer list so plugins that retain the + # reference for async snapshotting don't observe later + # mutations of api_messages. The inner dicts are not + # mutated by the agent loop, so a shallow copy is + # sufficient; a deepcopy would walk every tool result + # and base64 image on every API call. 
+ _invoke_hook( + "pre_api_request", + task_id=effective_task_id, + session_id=agent.session_id or "", + user_message=original_user_message, + conversation_history=list(messages), + platform=agent.platform or "", + model=agent.model, + provider=agent.provider, + base_url=agent.base_url, + api_mode=agent.api_mode, + api_call_count=api_call_count, + request_messages=list(request_messages) if isinstance(request_messages, list) else [], + message_count=len(api_messages), + tool_count=len(agent.tools or []), + approx_input_tokens=approx_tokens, + request_char_count=total_chars, + max_tokens=agent.max_tokens, + ) + except Exception: + pass + + if env_var_enabled("HERMES_DUMP_REQUESTS"): + agent._dump_api_request_debug(api_kwargs, reason="preflight") + + # Always prefer the streaming path — even without stream + # consumers. Streaming gives us fine-grained health + # checking (90s stale-stream detection, 60s read timeout) + # that the non-streaming path lacks. Without this, + # subagents and other quiet-mode callers can hang + # indefinitely when the provider keeps the connection + # alive with SSE pings but never delivers a response. + # The streaming path is a no-op for callbacks when no + # consumers are registered, and falls back to non- + # streaming automatically if the provider doesn't + # support it. + def _stop_spinner(): + nonlocal thinking_spinner + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + _use_streaming = True + # Provider signaled "stream not supported" on a previous + # attempt — switch to non-streaming for the rest of this + # session instead of re-failing every retry. + if getattr(agent, "_disable_streaming", False): + _use_streaming = False + # CopilotACPClient communicates via subprocess stdio and + # returns a plain SimpleNamespace — not an iterable + # stream. Mirror the ACP exclusion used for Responses + # API upgrade (lines ~1083-1085). 
+ elif ( + agent.provider == "copilot-acp" + or str(agent.base_url or "").lower().startswith("acp://copilot") + or str(agent.base_url or "").lower().startswith("acp+tcp://") + ): + _use_streaming = False + elif not agent._has_stream_consumers(): + # No display/TTS consumer. Still prefer streaming for + # health checking, but skip for Mock clients in tests + # (mocks return SimpleNamespace, not stream iterators). + from unittest.mock import Mock + if isinstance(getattr(agent, "client", None), Mock): + _use_streaming = False + + if _use_streaming: + response = agent._interruptible_streaming_api_call( + api_kwargs, on_first_delta=_stop_spinner + ) + else: + response = agent._interruptible_api_call(api_kwargs) + + api_duration = time.time() - api_start_time + + # Stop thinking spinner silently -- the response box or tool + # execution messages that follow are more informative. + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}⏱️ API call completed in {api_duration:.2f}s") + + if agent.verbose_logging: + # Log response with provider info if available + resp_model = getattr(response, 'model', 'N/A') if response else 'N/A' + logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") + + # Validate response shape before proceeding + response_invalid = False + error_details = [] + if agent.api_mode == "codex_responses": + _ct_v = agent._get_transport() + if not _ct_v.validate_response(response): + if response is None: + response_invalid = True + error_details.append("response is None") + else: + # Provider returned a terminal failure (e.g. quota exhaustion). + # Treat as invalid so the fallback chain is triggered instead of + # letting the error bubble up outside the retry/fallback loop. 
+ _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower() + if _codex_resp_status in {"failed", "cancelled"}: + _codex_error_obj = getattr(response, "error", None) + _codex_error_msg = ( + _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict) + else str(_codex_error_obj) if _codex_error_obj + else f"Responses API returned status '{_codex_resp_status}'" + ) + logging.warning( + "Codex response status='%s' (error=%s). Routing to fallback. %s", + _codex_resp_status, _codex_error_msg, + agent._client_log_context(), + ) + response_invalid = True + error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}") + else: + # output_text fallback: stream backfill may have failed + # but normalize can still recover from output_text + _out_text = getattr(response, "output_text", None) + _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" + if _out_text_stripped: + logger.debug( + "Codex response.output is empty but output_text is present " + "(%d chars); deferring to normalization.", + len(_out_text_stripped), + ) + else: + _resp_status = getattr(response, "status", None) + _resp_incomplete = getattr(response, "incomplete_details", None) + logger.warning( + "Codex response.output is empty after stream backfill " + "(status=%s, incomplete_details=%s, model=%s). 
%s", + _resp_status, _resp_incomplete, + getattr(response, "model", None), + f"api_mode={agent.api_mode} provider={agent.provider}", + ) + response_invalid = True + error_details.append("response.output is empty") + elif agent.api_mode == "anthropic_messages": + _tv = agent._get_transport() + if not _tv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + else: + error_details.append("response.content invalid (not a non-empty list)") + elif agent.api_mode == "bedrock_converse": + _btv = agent._get_transport() + if not _btv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + else: + error_details.append("Bedrock response invalid (no output or choices)") + else: + _ctv = agent._get_transport() + if not _ctv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + elif not hasattr(response, 'choices'): + error_details.append("response has no 'choices' attribute") + elif response.choices is None: + error_details.append("response.choices is None") + else: + error_details.append("response.choices is empty") + + if response_invalid: + # Stop spinner before printing error messages + if thinking_spinner: + thinking_spinner.stop("(´;ω;`) oops, retrying...") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + # Invalid response — could be rate limiting, provider timeout, + # upstream server error, or malformed response. + retry_count += 1 + + # Eager fallback: empty/malformed responses are a common + # rate-limit symptom. Switch to fallback immediately + # rather than retrying with extended backoff. 
+ if agent._fallback_index < len(agent._fallback_chain): + agent._emit_status("⚠️ Empty/malformed response — switching to fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + + # Check for error field in response (some providers include this) + error_msg = "Unknown" + provider_name = "Unknown" + if response and hasattr(response, 'error') and response.error: + error_msg = str(response.error) + # Try to extract provider from error metadata + if hasattr(response.error, 'metadata') and response.error.metadata: + provider_name = response.error.metadata.get('provider_name', 'Unknown') + elif response and hasattr(response, 'message') and response.message: + error_msg = str(response.message) + + # Try to get provider from model field (OpenRouter often returns actual model used) + if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model: + provider_name = f"model={response.model}" + + # Check for x-openrouter-provider or similar metadata + if provider_name == "Unknown" and response: + # Log all response attributes for debugging + resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')} + if agent.verbose_logging: + logging.debug(f"Response attributes for invalid response: {resp_attrs}") + + # Extract error code from response for contextual diagnostics + _resp_error_code = None + if response and hasattr(response, 'error') and response.error: + _code_raw = getattr(response.error, 'code', None) + if _code_raw is None and isinstance(response.error, dict): + _code_raw = response.error.get('code') + if _code_raw is not None: + try: + _resp_error_code = int(_code_raw) + except (TypeError, ValueError): + pass + + # Build a human-readable failure hint from the error code + # and response time, instead of always assuming rate limiting. 
+ if _resp_error_code == 524: + _failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)" + elif _resp_error_code == 504: + _failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)" + elif _resp_error_code == 429: + _failure_hint = f"rate limited by upstream provider (429)" + elif _resp_error_code in {500, 502}: + _failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)" + elif _resp_error_code in {503, 529}: + _failure_hint = f"upstream provider overloaded ({_resp_error_code})" + elif _resp_error_code is not None: + _failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)" + elif api_duration < 10: + _failure_hint = f"fast response ({api_duration:.1f}s) — likely rate limited" + elif api_duration > 60: + _failure_hint = f"slow response ({api_duration:.0f}s) — likely upstream timeout" + else: + _failure_hint = f"response time {api_duration:.1f}s" + + agent._vprint(f"{agent.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True) + agent._vprint(f"{agent.log_prefix} 🏢 Provider: {provider_name}", force=True) + cleaned_provider_error = agent._clean_error_message(error_msg) + agent._vprint(f"{agent.log_prefix} 📝 Provider message: {cleaned_provider_error}", force=True) + agent._vprint(f"{agent.log_prefix} ⏱️ {_failure_hint}", force=True) + + if retry_count >= max_retries: + # Try fallback before giving up + agent._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + agent._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. 
Giving up.") + logging.error(f"{agent.log_prefix}Invalid API response after {max_retries} retries.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Invalid API response after {max_retries} retries: {_failure_hint}", + "failed": True # Mark as failure for filtering + } + + # Backoff before retry — jittered exponential: 5s base, 120s cap + wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0) + agent._vprint(f"{agent.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True) + logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") + + # Sleep in small increments to stay responsive to interrupts + sleep_end = time.time() + wait_time + _backoff_touch_counter = 0 + while time.time() < sleep_end: + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + time.sleep(0.2) + # Touch activity every ~30s so the gateway's inactivity + # monitor knows we're alive during backoff waits. 
+ _backoff_touch_counter += 1 + if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s + agent._touch_activity( + f"retry backoff ({retry_count}/{max_retries}), " + f"{int(sleep_end - time.time())}s remaining" + ) + continue # Retry the API call + + # Check finish_reason before proceeding + if agent.api_mode == "codex_responses": + status = getattr(response, "status", None) + incomplete_details = getattr(response, "incomplete_details", None) + incomplete_reason = None + if isinstance(incomplete_details, dict): + incomplete_reason = incomplete_details.get("reason") + else: + incomplete_reason = getattr(incomplete_details, "reason", None) + if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}: + finish_reason = "length" + else: + finish_reason = "stop" + elif agent.api_mode == "anthropic_messages": + _tfr = agent._get_transport() + finish_reason = _tfr.map_finish_reason(response.stop_reason) + elif agent.api_mode == "bedrock_converse": + # Bedrock response already normalized at dispatch — use transport + _bt_fr = agent._get_transport() + _bedrock_result = _bt_fr.normalize_response(response) + finish_reason = _bedrock_result.finish_reason + else: + _cc_fr = agent._get_transport() + _finish_result = _cc_fr.normalize_response(response) + finish_reason = _finish_result.finish_reason + assistant_message = _finish_result + if agent._should_treat_stop_as_truncated( + finish_reason, + assistant_message, + messages, + ): + agent._vprint( + f"{agent.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated", + force=True, + ) + finish_reason = "length" + + if finish_reason == "length": + agent._vprint(f"{agent.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) + + # Normalize the truncated response to a single OpenAI-style + # message shape so text-continuation and tool-call retry + # work uniformly across chat_completions, bedrock_converse, + # and anthropic_messages. 
For Anthropic we use the same + # adapter the agent loop already relies on so the rebuilt + # interim assistant message is byte-identical to what + # would have been appended in the non-truncated path. + _trunc_msg = None + _trunc_transport = agent._get_transport() + if agent.api_mode == "anthropic_messages": + _trunc_result = _trunc_transport.normalize_response( + response, strip_tool_prefix=agent._is_anthropic_oauth + ) + else: + _trunc_result = _trunc_transport.normalize_response(response) + _trunc_msg = _trunc_result + + _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None + _trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False + + # ── Detect thinking-budget exhaustion ────────────── + # When the model spends ALL output tokens on reasoning + # and has none left for the response, continuation + # retries are pointless. Detect this early and give a + # targeted error instead of wasting 3 API calls. + # A response is "thinking exhausted" only when the model + # actually produced reasoning blocks but no visible text after + # them. Models that do not use tags (e.g. GLM-4.7 on + # NVIDIA Build, minimax) may return content=None or an empty + # string for unrelated reasons — treat those as normal + # truncations that deserve continuation retries, not as + # thinking-budget exhaustion. + _has_think_tags = bool( + _trunc_content and re.search( + r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>', + _trunc_content, + re.IGNORECASE, + ) + ) + _thinking_exhausted = ( + not _trunc_has_tool_calls + and _has_think_tags + and ( + (_trunc_content is not None and not agent._has_content_after_think_block(_trunc_content)) + or _trunc_content is None + ) + ) + + if _thinking_exhausted: + _exhaust_error = ( + "Model used all output tokens on reasoning with none left " + "for the response. Try lowering reasoning effort or " + "increasing max_tokens." 
+ ) + agent._vprint( + f"{agent.log_prefix}💭 Reasoning exhausted the output token budget — " + f"no visible response was produced.", + force=True, + ) + # Return a user-friendly message as the response so + # CLI (response box) and gateway (chat message) both + # display it naturally instead of a suppressed error. + _exhaust_response = ( + "⚠️ **Thinking Budget Exhausted**\n\n" + "The model used all its output tokens on reasoning " + "and had none left for the actual response.\n\n" + "To fix this:\n" + "→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n" + "→ Or switch to a larger/non-reasoning model with `/model`" + ) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": _exhaust_response, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": _exhaust_error, + } + + if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: + assistant_message = _trunc_msg + if assistant_message is not None and not _trunc_has_tool_calls: + length_continue_retries += 1 + interim_msg = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(interim_msg) + if assistant_message.content: + truncated_response_parts.append(assistant_message.content) + + if length_continue_retries < 3: + agent._vprint( + f"{agent.log_prefix}↻ Requesting continuation " + f"({length_continue_retries}/3)..." + ) + continue_msg = { + "role": "user", + "content": ( + "[System: Your previous response was truncated by the output " + "length limit. Continue exactly where you left off. Do not " + "restart or repeat prior text. 
Finish the answer directly.]" + ), + } + messages.append(continue_msg) + agent._session_messages = messages + agent._save_session_log(messages) + restart_with_length_continuation = True + break + + partial_response = agent._strip_think_blocks("".join(truncated_response_parts)).strip() + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": partial_response or None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response remained truncated after 3 continuation attempts", + } + + if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: + assistant_message = _trunc_msg + if assistant_message is not None and _trunc_has_tool_calls: + if truncated_tool_call_retries < 1: + truncated_tool_call_retries += 1 + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call detected — retrying API call...", + force=True, + ) + # Don't append the broken response to messages; + # just re-run the same API call from the current + # message state, giving the model another chance. 
+ continue + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.", + force=True, + ) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit", + } + + # If we have prior messages, roll back to last complete state + if len(messages) > 1: + agent._vprint(f"{agent.log_prefix} ⏪ Rolling back to last complete assistant turn") + rolled_back_messages = agent._get_messages_up_to_last_assistant(messages) + + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + + return { + "final_response": None, + "messages": rolled_back_messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit" + } + else: + # First message was truncated - mark as failed + agent._vprint(f"{agent.log_prefix}❌ First response truncated - cannot recover", force=True) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": "First response truncated due to output length limit" + } + + # Track actual token usage from response for context management + if hasattr(response, 'usage') and response.usage: + canonical_usage = normalize_usage( + response.usage, + provider=agent.provider, + api_mode=agent.api_mode, + ) + prompt_tokens = canonical_usage.prompt_tokens + completion_tokens = canonical_usage.output_tokens + total_tokens = canonical_usage.total_tokens + usage_dict = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + 
agent.context_compressor.update_from_response(usage_dict) + + # Cache discovered context length after successful call. + # Only persist limits confirmed by the provider (parsed + # from the error message), not guessed probe tiers. + if getattr(agent.context_compressor, "_context_probed", False): + ctx = agent.context_compressor.context_length + if getattr(agent.context_compressor, "_context_probe_persistable", False): + save_context_length(agent.model, agent.base_url, ctx) + agent._safe_print(f"{agent.log_prefix}💾 Cached context length: {ctx:,} tokens for {agent.model}") + agent.context_compressor._context_probed = False + agent.context_compressor._context_probe_persistable = False + + agent.session_prompt_tokens += prompt_tokens + agent.session_completion_tokens += completion_tokens + agent.session_total_tokens += total_tokens + agent.session_api_calls += 1 + agent.session_input_tokens += canonical_usage.input_tokens + agent.session_output_tokens += canonical_usage.output_tokens + agent.session_cache_read_tokens += canonical_usage.cache_read_tokens + agent.session_cache_write_tokens += canonical_usage.cache_write_tokens + agent.session_reasoning_tokens += canonical_usage.reasoning_tokens + + # Log API call details for debugging/observability + _cache_pct = "" + if canonical_usage.cache_read_tokens and prompt_tokens: + _cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)" + logger.info( + "API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s", + agent.session_api_calls, agent.model, agent.provider or "unknown", + prompt_tokens, completion_tokens, total_tokens, + api_duration, _cache_pct, + ) + + cost_result = estimate_usage_cost( + agent.model, + canonical_usage, + provider=agent.provider, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + ) + if cost_result.amount_usd is not None: + agent.session_estimated_cost_usd += 
float(cost_result.amount_usd) + agent.session_cost_status = cost_result.status + agent.session_cost_source = cost_result.source + + # Persist token counts to session DB for /insights. + # Do this for every platform with a session_id so non-CLI + # sessions (gateway, cron, delegated runs) cannot lose + # token/accounting data if a higher-level persistence path + # is skipped or fails. Gateway/session-store writes use + # absolute totals, so they safely overwrite these per-call + # deltas instead of double-counting them. + if agent._session_db and agent.session_id: + try: + # Ensure the session row exists before attempting UPDATE. + # Under concurrent load (cron/kanban), the initial + # _ensure_db_session() may have failed due to SQLite + # locking. Retry here so per-call token deltas are + # not silently lost (UPDATE on a non-existent row + # affects 0 rows without error). + if not agent._session_db_created: + agent._ensure_db_session() + agent._session_db.update_token_counts( + agent.session_id, + input_tokens=canonical_usage.input_tokens, + output_tokens=canonical_usage.output_tokens, + cache_read_tokens=canonical_usage.cache_read_tokens, + cache_write_tokens=canonical_usage.cache_write_tokens, + reasoning_tokens=canonical_usage.reasoning_tokens, + estimated_cost_usd=float(cost_result.amount_usd) + if cost_result.amount_usd is not None else None, + cost_status=cost_result.status, + cost_source=cost_result.source, + billing_provider=agent.provider, + billing_base_url=agent.base_url, + billing_mode="subscription_included" + if cost_result.status == "included" else None, + model=agent.model, + api_call_count=1, + ) + except Exception as e: + # Log token persistence failures so they're + # visible in agent.log — silent loss here is + # the root cause of undercounted analytics. 
+ logger.debug( + "Token persistence failed (session=%s, tokens=%d): %s", + agent.session_id, total_tokens, e, + ) + + if agent.verbose_logging: + logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}") + + # Surface cache hit stats for any provider that reports + # them — not just those where we inject cache_control + # markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic + # server-side prefix caching and return + # ``prompt_tokens_details.cached_tokens``; users + # previously could not see their cache % because this + # line was gated on ``_use_prompt_caching``, which is + # only True for Anthropic-style marker injection. + # ``canonical_usage`` is already normalised from all + # three API shapes (Anthropic / Codex / OpenAI-chat) + # so we can rely on its values directly. + cached = canonical_usage.cache_read_tokens + written = canonical_usage.cache_write_tokens + prompt = usage_dict["prompt_tokens"] + if (cached or written) and not agent.quiet_mode: + hit_pct = (cached / prompt * 100) if prompt > 0 else 0 + agent._vprint( + f"{agent.log_prefix} 💾 Cache: " + f"{cached:,}/{prompt:,} tokens " + f"({hit_pct:.0f}% hit, {written:,} written)" + ) + + has_retried_429 = False # Reset on success + # Clear Nous rate limit state on successful request — + # proves the limit has reset and other sessions can + # resume hitting Nous. 
+ if agent.provider == "nous": + try: + from agent.nous_rate_guard import clear_nous_rate_limit + clear_nous_rate_limit() + except Exception: + pass + agent._touch_activity(f"API call #{api_call_count} completed") + break # Success, exit retry loop + + except InterruptedError: + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + api_elapsed = time.time() - api_start_time + agent._vprint(f"{agent.log_prefix}⚡ Interrupted during API call.", force=True) + agent._persist_session(messages, conversation_history) + interrupted = True + final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." + break + + except Exception as api_error: + # Stop spinner before printing error messages + if thinking_spinner: + thinking_spinner.stop("(╥_╥) error, retrying...") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + # ----------------------------------------------------------- + # UnicodeEncodeError recovery. Two common causes: + # 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste + # (Google Docs, rich-text editors) — sanitize and retry. + # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale + # (e.g. Chromebooks) — any non-ASCII character fails. + # Detect via the error message mentioning 'ascii' codec. + # We sanitize messages in-place and may retry twice: + # first to strip surrogates, then once more for pure + # ASCII-only locale sanitization if needed. + # ----------------------------------------------------------- + if isinstance(api_error, UnicodeEncodeError) and getattr(agent, '_unicode_sanitization_passes', 0) < 2: + _err_str = str(api_error).lower() + _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str + # Detect surrogate errors — utf-8 codec refusing to + # encode U+D800..U+DFFF. 
The error text is: + # "'utf-8' codec can't encode characters in position + # N-M: surrogates not allowed" + _is_surrogate_error = ( + "surrogate" in _err_str + or ("'utf-8'" in _err_str and not _is_ascii_codec) + ) + # Sanitize surrogates from both the canonical `messages` + # list AND `api_messages` (the API-copy, which may carry + # `reasoning_content`/`reasoning_details` transformed + # from `reasoning` — fields the canonical list doesn't + # have directly). Also clean `api_kwargs` if built and + # `prefill_messages` if present. Mirrors the ASCII + # codec recovery below. + _surrogates_found = _sanitize_messages_surrogates(messages) + if isinstance(api_messages, list): + if _sanitize_messages_surrogates(api_messages): + _surrogates_found = True + if isinstance(api_kwargs, dict): + if _sanitize_structure_surrogates(api_kwargs): + _surrogates_found = True + if isinstance(getattr(agent, "prefill_messages", None), list): + if _sanitize_messages_surrogates(agent.prefill_messages): + _surrogates_found = True + # Gate the retry on the error type, not on whether we + # found anything — _force_ascii_payload / the extended + # surrogate walker above cover all known paths, but a + # new transformed field could still slip through. If + # the error was a surrogate encode failure, always let + # the retry run; the proactive sanitizer at line ~8781 + # runs again on the next iteration. Bounded by + # _unicode_sanitization_passes < 2 (outer guard). + if _surrogates_found or _is_surrogate_error: + agent._unicode_sanitization_passes += 1 + if _surrogates_found: + agent._vprint( + f"{agent.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ Surrogate encoding error — retrying after full-payload sanitization...", + force=True, + ) + continue + if _is_ascii_codec: + agent._force_ascii_payload = True + # ASCII codec: the system encoding can't handle + # non-ASCII characters at all. 
Sanitize all + # non-ASCII content from messages/tool schemas and retry. + # Sanitize both the canonical `messages` list and + # `api_messages` (the API-copy built before the retry + # loop, which may contain extra fields like + # reasoning_content that are not in `messages`). + _messages_sanitized = _sanitize_messages_non_ascii(messages) + if isinstance(api_messages, list): + _sanitize_messages_non_ascii(api_messages) + # Also sanitize the last api_kwargs if already built, + # so a leftover non-ASCII value in a transformed field + # (e.g. extra_body, reasoning_content) doesn't survive + # into the next attempt via _build_api_kwargs cache paths. + if isinstance(api_kwargs, dict): + _sanitize_structure_non_ascii(api_kwargs) + _prefill_sanitized = False + if isinstance(getattr(agent, "prefill_messages", None), list): + _prefill_sanitized = _sanitize_messages_non_ascii(agent.prefill_messages) + + _tools_sanitized = False + if isinstance(getattr(agent, "tools", None), list): + _tools_sanitized = _sanitize_tools_non_ascii(agent.tools) + + _system_sanitized = False + if isinstance(active_system_prompt, str): + _sanitized_system = _strip_non_ascii(active_system_prompt) + if _sanitized_system != active_system_prompt: + active_system_prompt = _sanitized_system + agent._cached_system_prompt = _sanitized_system + _system_sanitized = True + if isinstance(getattr(agent, "ephemeral_system_prompt", None), str): + _sanitized_ephemeral = _strip_non_ascii(agent.ephemeral_system_prompt) + if _sanitized_ephemeral != agent.ephemeral_system_prompt: + agent.ephemeral_system_prompt = _sanitized_ephemeral + _system_sanitized = True + + _headers_sanitized = False + _default_headers = ( + agent._client_kwargs.get("default_headers") + if isinstance(getattr(agent, "_client_kwargs", None), dict) + else None + ) + if isinstance(_default_headers, dict): + _headers_sanitized = _sanitize_structure_non_ascii(_default_headers) + + # Sanitize the API key — non-ASCII characters in + # credentials (e.g. 
ʋ instead of v from a bad + # copy-paste) cause httpx to fail when encoding + # the Authorization header as ASCII. This is the + # most common cause of persistent UnicodeEncodeError + # that survives message/tool sanitization (#6843). + _credential_sanitized = False + _raw_key = getattr(agent, "api_key", None) or "" + if _raw_key: + _clean_key = _strip_non_ascii(_raw_key) + if _clean_key != _raw_key: + agent.api_key = _clean_key + if isinstance(getattr(agent, "_client_kwargs", None), dict): + agent._client_kwargs["api_key"] = _clean_key + # Also update the live client — it holds its + # own copy of api_key which auth_headers reads + # dynamically on every request. + if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"): + agent.client.api_key = _clean_key + _credential_sanitized = True + agent._vprint( + f"{agent.log_prefix}⚠️ API key contained non-ASCII characters " + f"(bad copy-paste?) — stripped them. If auth fails, " + f"re-copy the key from your provider's dashboard.", + force=True, + ) + + # Always retry on ASCII codec detection — + # _force_ascii_payload guarantees the full + # api_kwargs payload is sanitized on the + # next iteration (line ~8475). Even when + # per-component checks above find nothing + # (e.g. non-ASCII only in api_messages' + # reasoning_content), the flag catches it. + # Bounded by _unicode_sanitization_passes < 2. + agent._unicode_sanitization_passes += 1 + _any_sanitized = ( + _messages_sanitized + or _prefill_sanitized + or _tools_sanitized + or _system_sanitized + or _headers_sanitized + or _credential_sanitized + ) + if _any_sanitized: + agent._vprint( + f"{agent.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from request payload. 
Retrying...", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ System encoding is ASCII — enabling full-payload sanitization for retry...", + force=True, + ) + continue + + # ── Image-rejection recovery ────────────────────────────── + # Some providers (mlx-lm, text-only endpoints, text-only + # fallbacks on multimodal models) reject any message that + # contains image_url content with a 4xx error like + # "Only 'text' content type is supported." On first hit, + # strip all images from the message list, mark the session + # as vision-unsupported, and retry with text only. + # + # Detection is best-effort English phrase matching — a + # locale-translated or heavily-reworded upstream error + # will bypass this guard and fall through to the normal + # error handler. Expand the phrase list when new + # provider wordings are observed in the wild. + _err_body = "" + try: + _err_body = str(getattr(api_error, "body", None) or + getattr(api_error, "message", None) or + str(api_error)) + except Exception: + pass + _err_status = getattr(api_error, "status_code", None) + _IMAGE_REJECTION_PHRASES = ( + "only 'text' content type is supported", + "only text content type is supported", + "image_url is not supported", + "image content is not supported", + "multimodal is not supported", + "multimodal content is not supported", + "multimodal input is not supported", + "vision is not supported", + "vision input is not supported", + "does not support images", + "does not support image input", + "does not support multimodal", + "does not support vision", + "model does not support image", + # ChatGPT-account Codex backend + # (https://chatgpt.com/backend-api/codex) rejects + # data:image/...base64 URLs in input_image fields + # with HTTP 400 "Invalid 'input[N].content[K].image_url'. + # Expected a valid URL, but got a value with an + # invalid format." The OpenAI Responses API on the + # public endpoint accepts data URLs, but the + # ChatGPT-account variant does not. 
Without this + # phrase the agent cascaded into compression / + # context-too-large recovery instead of just + # stripping the images. Match is narrow on + # purpose — keyed on the field-path apostrophe so + # we don't false-trip on other URL validation + # errors. (issue #23570) + "image_url'. expected", + # DeepSeek's OpenAI-compatible API reports text-only + # request-body variants as: + # "unknown variant `image_url`, expected `text`". + "unknown variant `image_url`, expected `text`", + "unknown variant image_url, expected text", + ) + _err_lower = _err_body.lower() + _looks_like_image_rejection = any( + p in _err_lower for p in _IMAGE_REJECTION_PHRASES + ) + # 4xx-only gate: never interpret 5xx/timeout as "server + # said no to images" — those are transient and must + # route to the normal retry path. + _status_ok = _err_status is None or (400 <= int(_err_status) < 500) + if ( + getattr(agent, "_vision_supported", True) + and _looks_like_image_rejection + and _status_ok + ): + agent._vision_supported = False + _imgs_removed = _strip_images_from_messages(messages) + if isinstance(api_messages, list): + _strip_images_from_messages(api_messages) + agent._vprint( + f"{agent.log_prefix}⚠️ Server rejected image content — " + f"switching to text-only mode for this session" + + (". Stripped images from history and retrying." 
if _imgs_removed else "."), + force=True, + ) + continue + + status_code = getattr(api_error, "status_code", None) + error_context = agent._extract_api_error_context(api_error) + + # ── Classify the error for structured recovery decisions ── + _compressor = getattr(agent, "context_compressor", None) + _ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000 + classified = classify_api_error( + api_error, + provider=getattr(agent, "provider", "") or "", + model=getattr(agent, "model", "") or "", + approx_tokens=approx_tokens, + context_length=_ctx_len, + num_messages=len(api_messages) if api_messages else 0, + ) + logger.debug( + "Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s", + classified.reason.value, classified.status_code, + classified.retryable, classified.should_compress, + classified.should_rotate_credential, classified.should_fallback, + ) + + recovered_with_pool, has_retried_429 = agent._recover_with_credential_pool( + status_code=status_code, + has_retried_429=has_retried_429, + classified_reason=classified.reason, + error_context=error_context, + ) + if recovered_with_pool: + continue + + # Image-too-large recovery: shrink oversized native image + # parts in-place and retry once. Triggered by Anthropic's + # per-image 5 MB ceiling (400 with "image exceeds 5 MB + # maximum") or any other provider that complains about + # image size. If shrink fails or a second attempt still + # fails, fall through to normal error handling. 
+ if ( + classified.reason == FailoverReason.image_too_large + and not image_shrink_retry_attempted + ): + image_shrink_retry_attempted = True + if agent._try_shrink_image_parts_in_messages(api_messages): + agent._vprint( + f"{agent.log_prefix}📐 Image(s) exceeded provider size limit — " + f"shrank and retrying...", + force=True, + ) + continue + else: + logger.info( + "image-shrink recovery: no data-URL image parts found " + "or shrink didn't reduce size; surfacing original error." + ) + + # Anthropic OAuth subscription rejected the 1M-context beta + # header ("long context beta is not yet available for this + # subscription"). Disable the beta for the rest of this + # session, rebuild the client, and retry once. 1M-capable + # subscriptions never hit this branch — they accept the + # beta and keep full 1M context. See PR #17680 for the + # original report (we chose reactive recovery over the + # proposed unconditional omit so capable subscriptions + # don't silently lose the capability). 
+ if ( + classified.reason == FailoverReason.oauth_long_context_beta_forbidden + and agent.api_mode == "anthropic_messages" + and agent._is_anthropic_oauth + and not oauth_1m_beta_retry_attempted + ): + oauth_1m_beta_retry_attempted = True + if not getattr(agent, "_oauth_1m_beta_disabled", False): + agent._oauth_1m_beta_disabled = True + try: + agent._anthropic_client.close() + except Exception: + pass + agent._rebuild_anthropic_client() + agent._vprint( + f"{agent.log_prefix}🔕 OAuth subscription doesn't support " + f"the 1M-context beta — disabled for this session and retrying...", + force=True, + ) + continue + + if ( + agent.api_mode == "codex_responses" + and agent.provider in {"openai-codex", "xai-oauth"} + and status_code == 401 + and not codex_auth_retry_attempted + ): + codex_auth_retry_attempted = True + if agent._try_refresh_codex_client_credentials(force=True): + _label = "xAI OAuth" if agent.provider == "xai-oauth" else "Codex" + agent._vprint(f"{agent.log_prefix}🔐 {_label} auth refreshed after 401. Retrying request...") + continue + if ( + agent.api_mode == "chat_completions" + and agent.provider == "nous" + and status_code == 401 + and not nous_auth_retry_attempted + ): + nous_auth_retry_attempted = True + if agent._try_refresh_nous_client_credentials(force=True): + print(f"{agent.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...") + continue + # Credential refresh didn't help — show diagnostic info. + # Most common causes: Portal OAuth expired/revoked, + # account out of credits, or agent key blocked. 
+ from hermes_constants import display_hermes_home as _dhh_fn + _dhh = _dhh_fn() + _body_text = "" + try: + _body = getattr(api_error, "body", None) or getattr(api_error, "response", None) + if _body is not None: + _body_text = str(_body)[:200] + except Exception: + pass + print(f"{agent.log_prefix}🔐 Nous 401 — Portal authentication failed.") + if _body_text: + print(f"{agent.log_prefix} Response: {_body_text}") + print(f"{agent.log_prefix} Most likely: Portal OAuth expired, account out of credits, or agent key revoked.") + print(f"{agent.log_prefix} Troubleshooting:") + print(f"{agent.log_prefix} • Re-authenticate: hermes login --provider nous") + print(f"{agent.log_prefix} • Check credits / billing: https://portal.nousresearch.com") + print(f"{agent.log_prefix} • Verify stored credentials: {_dhh}/auth.json") + print(f"{agent.log_prefix} • Switch providers temporarily: /model --provider openrouter") + if ( + agent.provider == "copilot" + and status_code == 401 + and not copilot_auth_retry_attempted + ): + copilot_auth_retry_attempted = True + if agent._try_refresh_copilot_client_credentials(): + agent._vprint(f"{agent.log_prefix}🔐 Copilot credentials refreshed after 401. Retrying request...") + continue + if ( + agent.api_mode == "anthropic_messages" + and status_code == 401 + and hasattr(agent, '_anthropic_api_key') + and not anthropic_auth_retry_attempted + ): + anthropic_auth_retry_attempted = True + from agent.anthropic_adapter import _is_oauth_token + if agent._try_refresh_anthropic_client_credentials(): + print(f"{agent.log_prefix}🔐 Anthropic credentials refreshed after 401. 
Retrying request...") + continue + # Credential refresh didn't help — show diagnostic info + key = agent._anthropic_api_key + auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)" + print(f"{agent.log_prefix}🔐 Anthropic 401 — authentication failed.") + print(f"{agent.log_prefix} Auth method: {auth_method}") + print(f"{agent.log_prefix} Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{agent.log_prefix} Token: (empty or short)") + print(f"{agent.log_prefix} Troubleshooting:") + from hermes_constants import display_hermes_home as _dhh_fn + _dhh = _dhh_fn() + print(f"{agent.log_prefix} • Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens") + print(f"{agent.log_prefix} • Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values") + print(f"{agent.log_prefix} • For API keys: verify at https://platform.claude.com/settings/keys") + print(f"{agent.log_prefix} • For Claude Code: run 'claude /login' to refresh, then retry") + print(f"{agent.log_prefix} • Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"") + print(f"{agent.log_prefix} • Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"") + + # ── Thinking block signature recovery ───────────────── + # Anthropic signs thinking blocks against the full turn + # content. Any upstream mutation (context compression, + # session truncation, message merging) invalidates the + # signature → HTTP 400. Recovery: strip reasoning_details + # from all messages so the next retry sends no thinking + # blocks at all. One-shot — don't retry infinitely. 
+ if ( + classified.reason == FailoverReason.thinking_signature + and not thinking_sig_retry_attempted + ): + thinking_sig_retry_attempted = True + for _m in messages: + if isinstance(_m, dict): + _m.pop("reasoning_details", None) + agent._vprint( + f"{agent.log_prefix}⚠️ Thinking block signature invalid — " + f"stripped all thinking blocks, retrying...", + force=True, + ) + logging.warning( + "%sThinking block signature recovery: stripped " + "reasoning_details from %d messages", + agent.log_prefix, len(messages), + ) + continue + + # ── llama.cpp grammar-parse recovery ────────────────── + # llama.cpp's ``json-schema-to-grammar`` converter rejects + # regex escape classes (``\d``, ``\w``, ``\s``) and most + # ``format`` values in tool schemas. MCP servers emit + # these routinely for date/phone/email params. Recovery: + # strip ``pattern``/``format`` from ``agent.tools`` and + # retry once. We keep the keywords by default so cloud + # providers get the full prompting hints; this branch + # fires only for users on llama.cpp's OAI server. + if ( + classified.reason == FailoverReason.llama_cpp_grammar_pattern + and not llama_cpp_grammar_retry_attempted + ): + llama_cpp_grammar_retry_attempted = True + try: + from tools.schema_sanitizer import strip_pattern_and_format + _, _stripped = strip_pattern_and_format(agent.tools) + except Exception as _strip_exc: # pragma: no cover — defensive + logging.warning( + "%sllama.cpp grammar recovery: strip helper failed: %s", + agent.log_prefix, _strip_exc, + ) + _stripped = 0 + if _stripped: + agent._vprint( + f"{agent.log_prefix}⚠️ llama.cpp rejected tool schema grammar — " + f"stripped {_stripped} pattern/format keyword(s), retrying...", + force=True, + ) + logging.warning( + "%sllama.cpp grammar recovery: stripped %d " + "pattern/format keyword(s) from tool schemas", + agent.log_prefix, _stripped, + ) + continue + # No keywords found to strip — fall through to normal + # retry path rather than loop forever on the same error. 
+ logging.warning( + "%sllama.cpp grammar error but no pattern/format " + "keywords to strip — falling through to normal retry", + agent.log_prefix, + ) + + retry_count += 1 + elapsed_time = time.time() - api_start_time + agent._touch_activity( + f"API error recovery (attempt {retry_count}/{max_retries})" + ) + + error_type = type(api_error).__name__ + error_msg = str(api_error).lower() + _error_summary = agent._summarize_api_error(api_error) + logger.warning( + "API call failed (attempt %s/%s) error_type=%s %s summary=%s", + retry_count, + max_retries, + error_type, + agent._client_log_context(), + _error_summary, + ) + + _provider = getattr(agent, "provider", "unknown") + _base = getattr(agent, "base_url", "unknown") + _model = getattr(agent, "model", "unknown") + _status_code_str = f" [HTTP {status_code}]" if status_code else "" + agent._vprint(f"{agent.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True) + agent._vprint(f"{agent.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True) + agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True) + agent._vprint(f"{agent.log_prefix} 📝 Error: {_error_summary}", force=True) + if status_code and status_code < 500: + _err_body = getattr(api_error, "body", None) + _err_body_str = str(_err_body)[:300] if _err_body else None + if _err_body_str: + agent._vprint(f"{agent.log_prefix} 📋 Details: {_err_body_str}", force=True) + agent._vprint(f"{agent.log_prefix} ⏱️ Elapsed: {elapsed_time:.2f}s Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens") + + # Actionable hint for OpenRouter "no tool endpoints" error. + # This fires regardless of whether fallback succeeds — the + # user needs to know WHY their model failed so they can fix + # their provider routing, not just silently fall back. 
+ if ( + agent._is_openrouter_url() + and "support tool use" in error_msg + ): + agent._vprint( + f"{agent.log_prefix} 💡 No OpenRouter providers for {_model} support tool calling with your current settings.", + force=True, + ) + if agent.providers_allowed: + agent._vprint( + f"{agent.log_prefix} Your provider_routing.only restriction is filtering out tool-capable providers.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Try removing the restriction or adding providers that support tools for this model.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Check which providers support tools: https://openrouter.ai/models/{_model}", + force=True, + ) + + # Check for interrupt before deciding to retry + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + + # Check for 413 payload-too-large BEFORE generic 4xx handler. + # A 413 is a payload-size error — the correct response is to + # compress history and retry, not abort immediately. + status_code = getattr(api_error, "status_code", None) + + # ── Anthropic Sonnet long-context tier gate ─────────── + # Anthropic returns HTTP 429 "Extra usage is required for + # long context requests" when a Claude Max (or similar) + # subscription doesn't include the 1M-context tier. This + # is NOT a transient rate limit — retrying or switching + # credentials won't help. Reduce context to 200k (the + # standard tier) and compress. 
+ if classified.reason == FailoverReason.long_context_tier: + _reduced_ctx = 200000 + compressor = agent.context_compressor + old_ctx = compressor.context_length + if old_ctx > _reduced_ctx: + compressor.update_model( + model=agent.model, + context_length=_reduced_ctx, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + ) + # Context probing flags — only set on built-in + # compressor (plugin engines manage their own). + if hasattr(compressor, "_context_probed"): + compressor._context_probed = True + # Don't persist — this is a subscription-tier + # limitation, not a model capability. If the + # user later enables extra usage the 1M limit + # should come back automatically. + compressor._context_probe_persistable = False + agent._vprint( + f"{agent.log_prefix}⚠️ Anthropic long-context tier " + f"requires extra usage — reducing context: " + f"{old_ctx:,} → {_reduced_ctx:,} tokens", + force=True, + ) + + compression_attempts += 1 + if compression_attempts <= max_compression_attempts: + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, + approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session — clear history + # so _flush_messages_to_session_db writes compressed + # messages to the new session, not skipping them. + conversation_history = None + if len(messages) < original_len or old_ctx > _reduced_ctx: + agent._emit_status( + f"🗜️ Context reduced to {_reduced_ctx:,} tokens " + f"(was {old_ctx:,}), retrying..." + ) + time.sleep(2) + restart_with_compressed_messages = True + break + # Fall through to normal error handling if compression + # is exhausted or didn't help. + + # Eager fallback for rate-limit errors (429 or quota exhaustion). 
+ # When a fallback model is configured, switch immediately instead + # of burning through retries with exponential backoff -- the + # primary provider won't recover within the retry window. + is_rate_limited = classified.reason in { + FailoverReason.rate_limit, + FailoverReason.billing, + } + if is_rate_limited and agent._fallback_index < len(agent._fallback_chain): + # Don't eagerly fallback if credential pool rotation may + # still recover. See _pool_may_recover_from_rate_limit + # for the single-credential-pool and CloudCode-quota + # exceptions. Fixes #11314 and #13636. + pool_may_recover = _pool_may_recover_from_rate_limit( + agent._credential_pool, + provider=agent.provider, + base_url=getattr(agent, "base_url", None), + ) + if not pool_may_recover: + agent._emit_status("⚠️ Rate limited — switching to fallback provider...") + if agent._try_activate_fallback(reason=classified.reason): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + + # ── Nous Portal: record rate limit & skip retries ───── + # When Nous returns a 429 that is a genuine account- + # level rate limit, record the reset time to a shared + # file so ALL sessions (cron, gateway, auxiliary) know + # not to pile on, then skip further retries -- each + # one burns another RPH request and deepens the hole. + # The retry loop's top-of-iteration guard will catch + # this on the next pass and try fallback or bail. + # + # IMPORTANT: Nous Portal multiplexes multiple upstream + # providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can + # also mean an UPSTREAM provider is out of capacity + # for one specific model -- transient, clears in + # seconds, nothing to do with the caller's quota. + # Tripping the cross-session breaker on that would + # block every Nous model for minutes. 
We use + # ``is_genuine_nous_rate_limit`` to tell the two + # apart via the 429's own x-ratelimit-* headers and + # the last-known-good state captured on the previous + # successful response. + if ( + is_rate_limited + and agent.provider == "nous" + and classified.reason == FailoverReason.rate_limit + and not recovered_with_pool + ): + _genuine_nous_rate_limit = False + try: + from agent.nous_rate_guard import ( + is_genuine_nous_rate_limit, + record_nous_rate_limit, + ) + _err_resp = getattr(api_error, "response", None) + _err_hdrs = ( + getattr(_err_resp, "headers", None) + if _err_resp else None + ) + _genuine_nous_rate_limit = is_genuine_nous_rate_limit( + headers=_err_hdrs, + last_known_state=agent._rate_limit_state, + ) + if _genuine_nous_rate_limit: + record_nous_rate_limit( + headers=_err_hdrs, + error_context=error_context, + ) + else: + logging.info( + "Nous 429 looks like upstream capacity " + "(no exhausted bucket in headers or " + "last-known state) -- not tripping " + "cross-session breaker." + ) + except Exception: + pass + if _genuine_nous_rate_limit: + # Skip straight to max_retries -- the + # top-of-loop guard will handle fallback or + # bail cleanly. + retry_count = max_retries + continue + # Upstream capacity 429: fall through to normal + # retry logic. A different model (or the same + # model a moment later) will typically succeed. + + is_payload_too_large = ( + classified.reason == FailoverReason.payload_too_large + ) + + # Actionable hint for GitHub Models (Azure) 413 errors. + # The free tier enforces a hard 8K token cap per request, + # which Hermes' system prompt + tool schemas alone exceed. + # Compression can't help — the floor is the system prompt + # itself, not the conversation — so surface a clear "not + # compatible" message instead of looping into three futile + # compression attempts. 
+ if ( + status_code == 413 + and isinstance(agent.base_url, str) + and "models.inference.ai.azure.com" in agent.base_url + ): + agent._vprint( + f"{agent.log_prefix} 💡 GitHub Models free tier (models.inference.ai.azure.com) caps every", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} request at ~8K tokens. Hermes' system prompt + tool schemas baseline", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} exceeds that floor, so this endpoint cannot run an agentic loop.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Use the `copilot` provider with a Copilot subscription token (`hermes", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} setup` → GitHub Copilot), or pick any other provider.", + force=True, + ) + + if is_payload_too_large: + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True) + agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}413 compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + agent._emit_status(f"⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...") + + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session — clear history + # so _flush_messages_to_session_db writes compressed + # 
messages to the new session, not skipping them. + conversation_history = None + + if len(messages) < original_len: + agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") + time.sleep(2) # Brief pause between compression retries + restart_with_compressed_messages = True + break + else: + agent._vprint(f"{agent.log_prefix}❌ Payload too large and cannot compress further.", force=True) + agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}413 payload too large. Cannot compress further.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": "Request payload too large (413). Cannot compress further.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + + # Check for context-length errors BEFORE generic 4xx handler. + # The classifier detects context overflow from: explicit error + # messages, generic 400 + large session heuristic (#1630), and + # server disconnect + large session pattern (#2153). + is_context_length_error = ( + classified.reason == FailoverReason.context_overflow + ) + + if is_context_length_error: + compressor = agent.context_compressor + old_ctx = compressor.context_length + + # ── Distinguish two very different errors ─────────── + # 1. "Prompt too long": the INPUT exceeds the context window. + # Fix: reduce context_length + compress history. + # 2. "max_tokens too large": input is fine, but + # input_tokens + requested max_tokens > context_window. + # Fix: reduce max_tokens (the OUTPUT cap) for this call. + # Do NOT shrink context_length — the window is unchanged. + # + # Note: max_tokens = output token cap (one response). + # context_length = total window (input + output combined). 
+ available_out = parse_available_output_tokens_from_error(error_msg) + if available_out is not None: + # Error is purely about the output cap being too large. + # Cap output to the available space and retry without + # touching context_length or triggering compression. + safe_out = max(1, available_out - 64) # small safety margin + agent._ephemeral_max_output_tokens = safe_out + agent._vprint( + f"{agent.log_prefix}⚠️ Output cap too large for current prompt — " + f"retrying with max_tokens={safe_out:,} " + f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})", + force=True, + ) + # Still count against compression_attempts so we don't + # loop forever if the error keeps recurring. + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) + agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + restart_with_compressed_messages = True + break + + # Error is about the INPUT being too large — reduce context_length. 
+ # Try to parse the actual limit from the error message + parsed_limit = parse_context_limit_from_error(error_msg) + _provider_lower = (getattr(agent, "provider", "") or "").lower() + _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower() + is_minimax_provider = ( + _provider_lower in {"minimax", "minimax-cn"} + or _base_lower.startswith(( + "https://api.minimax.io/anthropic", + "https://api.minimaxi.com/anthropic", + )) + ) + minimax_delta_only_overflow = ( + is_minimax_provider + and parsed_limit is None + and "context window exceeds limit (" in error_msg + ) + if parsed_limit and parsed_limit < old_ctx: + new_ctx = parsed_limit + agent._vprint(f"{agent.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True) + elif minimax_delta_only_overflow: + new_ctx = old_ctx + agent._vprint( + f"{agent.log_prefix}Provider reported overflow amount only; " + f"keeping context_length at {old_ctx:,} tokens and compressing.", + force=True, + ) + else: + # Step down to the next probe tier + new_ctx = get_next_probe_tier(old_ctx) + + if new_ctx and new_ctx < old_ctx: + compressor.update_model( + model=agent.model, + context_length=new_ctx, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + ) + # Context probing flags — only set on built-in + # compressor (plugin engines manage their own). + if hasattr(compressor, "_context_probed"): + compressor._context_probed = True + # Only persist limits parsed from the provider's + # error message (a real number). Guessed fallback + # tiers from get_next_probe_tier() should stay + # in-memory only — persisting them pollutes the + # cache with wrong values. 
+ compressor._context_probe_persistable = bool( + parsed_limit and parsed_limit == new_ctx + ) + agent._vprint(f"{agent.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True) + else: + agent._vprint(f"{agent.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...", force=True) + + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) + agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + agent._emit_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...") + + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session — clear history + # so _flush_messages_to_session_db writes compressed + # messages to the new session, not skipping them. 
+ conversation_history = None + + if len(messages) < original_len or new_ctx and new_ctx < old_ctx: + if len(messages) < original_len: + agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") + time.sleep(2) # Brief pause between compression retries + restart_with_compressed_messages = True + break + else: + # Can't compress further and already at minimum tier + agent._vprint(f"{agent.log_prefix}❌ Context length exceeded and cannot compress further.", force=True) + agent._vprint(f"{agent.log_prefix} 💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True) + logging.error(f"{agent.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + + # Check for non-retryable client errors. The classifier + # already accounts for 413, 429, 529 (transient), context + # overflow, and generic-400 heuristics. Local validation + # errors (ValueError, TypeError) are programming bugs. + # Exclude UnicodeEncodeError — it's a ValueError subclass + # but is handled separately by the surrogate sanitization + # path above. Exclude json.JSONDecodeError — also a + # ValueError subclass, but it indicates a transient + # provider/network failure (malformed response body, + # truncated stream, routing layer corruption), not a + # local programming bug, and should be retried (#14782). 
+ is_local_validation_error = ( + isinstance(api_error, (ValueError, TypeError)) + and not isinstance( + api_error, (UnicodeEncodeError, json.JSONDecodeError) + ) + # ssl.SSLError (and its subclass SSLCertVerificationError) + # inherits from OSError *and* ValueError via Python MRO, + # so the isinstance(ValueError) check above would + # misclassify a TLS transport failure as a local + # programming bug and abort without retrying. Exclude + # ssl.SSLError explicitly so the error classifier's + # retryable=True mapping takes effect instead. + and not isinstance(api_error, ssl.SSLError) + ) + is_client_error = ( + is_local_validation_error + or ( + not classified.retryable + and not classified.should_compress + and classified.reason not in { + FailoverReason.rate_limit, + FailoverReason.billing, + FailoverReason.overloaded, + FailoverReason.context_overflow, + FailoverReason.payload_too_large, + FailoverReason.long_context_tier, + FailoverReason.thinking_signature, + } + ) + ) and not is_context_length_error + + if is_client_error: + # Try fallback before aborting — a different provider + # may not have the same issue (rate limit, auth, etc.) + agent._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + if api_kwargs is not None: + agent._dump_api_request_debug( + api_kwargs, reason="non_retryable_client_error", error=api_error, + ) + agent._emit_status( + f"❌ Non-retryable error (HTTP {status_code}): " + f"{agent._summarize_api_error(api_error)}" + ) + agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). 
Aborting.", force=True) + agent._vprint(f"{agent.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True) + agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True) + # Actionable guidance for common auth errors + if classified.is_auth or classified.reason == FailoverReason.billing: + if _provider in {"openai-codex", "xai-oauth"} and status_code == 401: + if _provider == "openai-codex": + agent._vprint(f"{agent.log_prefix} 💡 Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True) + agent._vprint(f"{agent.log_prefix} refreshed by another client (Codex CLI, VS Code). To fix:", force=True) + agent._vprint(f"{agent.log_prefix} 1. Run `codex` in your terminal to generate fresh tokens.", force=True) + agent._vprint(f"{agent.log_prefix} 2. Then run `hermes auth` to re-authenticate.", force=True) + else: + agent._vprint(f"{agent.log_prefix} 💡 xAI OAuth token was rejected (HTTP 401). To fix:", force=True) + agent._vprint(f"{agent.log_prefix} re-authenticate with xAI Grok OAuth (SuperGrok Subscription) from `hermes model`.", force=True) + else: + agent._vprint(f"{agent.log_prefix} 💡 Your API key was rejected by the provider. Check:", force=True) + agent._vprint(f"{agent.log_prefix} • Is the key valid? Run: hermes setup", force=True) + agent._vprint(f"{agent.log_prefix} • Does your account have access to {_model}?", force=True) + if base_url_host_matches(str(_base), "openrouter.ai"): + agent._vprint(f"{agent.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True) + else: + agent._vprint(f"{agent.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True) + logging.error(f"{agent.log_prefix}Non-retryable client error: {api_error}") + # Skip session persistence when the error is likely + # context-overflow related (status 400 + large session). + # Persisting the failed user message would make the + # session even larger, causing the same failure on the + # next attempt. 
(#1630) + if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80): + agent._vprint( + f"{agent.log_prefix}⚠️ Skipping session persistence " + f"for large failed session to prevent growth loop.", + force=True, + ) + else: + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": str(api_error), + } + + if retry_count >= max_retries: + # Before falling back, try rebuilding the primary + # client once for transient transport errors (stale + # connection pool, TCP reset). Only attempted once + # per API call block. + if not primary_recovery_attempted and agent._try_recover_primary_transport( + api_error, retry_count=retry_count, max_retries=max_retries, + ): + primary_recovery_attempted = True + retry_count = 0 + continue + # Try fallback before giving up entirely + agent._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + _final_summary = agent._summarize_api_error(api_error) + if is_rate_limited: + agent._emit_status(f"❌ Rate limited after {max_retries} retries — {_final_summary}") + else: + agent._emit_status(f"❌ API failed after {max_retries} retries — {_final_summary}") + agent._vprint(f"{agent.log_prefix} 💀 Final error: {_final_summary}", force=True) + + # Detect SSE stream-drop pattern (e.g. "Network + # connection lost") and surface actionable guidance. + # This typically happens when the model generates a + # very large tool call (write_file with huge content) + # and the proxy/CDN drops the stream mid-response. 
+ _is_stream_drop = ( + not getattr(api_error, "status_code", None) + and any(p in error_msg for p in ( + "connection lost", "connection reset", + "connection closed", "network connection", + "network error", "terminated", + )) + ) + if _is_stream_drop: + agent._vprint( + f"{agent.log_prefix} 💡 The provider's stream " + f"connection keeps dropping. This often happens " + f"when the model tries to write a very large " + f"file in a single tool call.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Try asking the model " + f"to use execute_code with Python's open() for " + f"large files, or to write the file in smaller " + f"sections.", + force=True, + ) + + logging.error( + "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s", + agent.log_prefix, max_retries, _final_summary, + _provider, _model, len(api_messages), f"{approx_tokens:,}", + ) + if api_kwargs is not None: + agent._dump_api_request_debug( + api_kwargs, reason="max_retries_exhausted", error=api_error, + ) + agent._persist_session(messages, conversation_history) + _final_response = f"API call failed after {max_retries} retries: {_final_summary}" + if _is_stream_drop: + _final_response += ( + "\n\nThe provider's stream connection keeps " + "dropping — this often happens when generating " + "very large tool call responses (e.g. write_file " + "with long content). Try asking me to use " + "execute_code with Python's open() for large " + "files, or to write in smaller sections." 
+ ) + return { + "final_response": _final_response, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": _final_summary, + } + + # For rate limits, respect the Retry-After header if present + _retry_after = None + if is_rate_limited: + _resp_headers = getattr(getattr(api_error, "response", None), "headers", None) + if _resp_headers and hasattr(_resp_headers, "get"): + _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After") + if _ra_raw: + try: + _retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes + except (TypeError, ValueError): + pass + wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0) + if is_rate_limited: + agent._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...") + else: + agent._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...") + logger.warning( + "Retrying API call in %ss (attempt %s/%s) %s error=%s", + wait_time, + retry_count, + max_retries, + agent._client_log_context(), + api_error, + ) + # Sleep in small increments so we can respond to interrupts quickly + # instead of blocking the entire wait_time in one sleep() call + sleep_end = time.time() + wait_time + _backoff_touch_counter = 0 + while time.time() < sleep_end: + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + time.sleep(0.2) # Check interrupt every 200ms + # Touch activity every ~30s so the gateway's inactivity + # monitor knows we're alive during backoff waits. 
+ _backoff_touch_counter += 1 + if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s + agent._touch_activity( + f"error retry backoff ({retry_count}/{max_retries}), " + f"{int(sleep_end - time.time())}s remaining" + ) + + # If the API call was interrupted, skip response processing + if interrupted: + _turn_exit_reason = "interrupted_during_api_call" + break + + if restart_with_compressed_messages: + api_call_count -= 1 + agent.iteration_budget.refund() + # Count compression restarts toward the retry limit to prevent + # infinite loops when compression reduces messages but not enough + # to fit the context window. + retry_count += 1 + restart_with_compressed_messages = False + continue + + if restart_with_length_continuation: + # Progressively boost the output token budget on each retry. + # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768. + # Applies to all providers via _ephemeral_max_output_tokens. + _boost_base = agent.max_tokens if agent.max_tokens else 4096 + _boost = _boost_base * (length_continue_retries + 1) + agent._ephemeral_max_output_tokens = min(_boost, 32768) + continue + + # Guard: if all retries exhausted without a successful response + # (e.g. repeated context-length errors that exhausted retry_count), + # the `response` variable is still None. Break out cleanly. + if response is None: + _turn_exit_reason = "all_retries_exhausted_no_response" + print(f"{agent.log_prefix}❌ All API retries exhausted with no successful response.") + agent._persist_session(messages, conversation_history) + break + + try: + _transport = agent._get_transport() + _normalize_kwargs = {} + if agent.api_mode == "anthropic_messages": + _normalize_kwargs["strip_tool_prefix"] = agent._is_anthropic_oauth + normalized = _transport.normalize_response(response, **_normalize_kwargs) + assistant_message = normalized + finish_reason = normalized.finish_reason + + # Normalize content to string — some OpenAI-compatible servers + # (llama-server, etc.) 
return content as a dict or list instead + # of a plain string, which crashes downstream .strip() calls. + if assistant_message.content is not None and not isinstance(assistant_message.content, str): + raw = assistant_message.content + if isinstance(raw, dict): + assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw) + elif isinstance(raw, list): + # Multimodal content list — extract text parts + parts = [] + for part in raw: + if isinstance(part, str): + parts.append(part) + elif isinstance(part, dict) and part.get("type") == "text": + parts.append(part.get("text", "")) + elif isinstance(part, dict) and "text" in part: + parts.append(str(part["text"])) + assistant_message.content = "\n".join(parts) + else: + assistant_message.content = str(raw) + + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or [] + _assistant_text = assistant_message.content or "" + _invoke_hook( + "post_api_request", + task_id=effective_task_id, + session_id=agent.session_id or "", + platform=agent.platform or "", + model=agent.model, + provider=agent.provider, + base_url=agent.base_url, + api_mode=agent.api_mode, + api_call_count=api_call_count, + api_duration=api_duration, + finish_reason=finish_reason, + message_count=len(api_messages), + response_model=getattr(response, "model", None), + response=response, + usage=agent._usage_summary_for_api_request_hook(response), + assistant_message=assistant_message, + assistant_content_chars=len(_assistant_text), + assistant_tool_call_count=len(_assistant_tool_calls), + ) + except Exception: + pass + + # Handle assistant response + if assistant_message.content and not agent.quiet_mode: + if agent.verbose_logging: + agent._vprint(f"{agent.log_prefix}🤖 Assistant: {assistant_message.content}") + else: + agent._vprint(f"{agent.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' 
if len(assistant_message.content) > 100 else ''}") + + # Notify progress callback of model's thinking (used by subagent + # delegation to relay the child's reasoning to the parent display). + if (assistant_message.content and agent.tool_progress_callback): + _think_text = assistant_message.content.strip() + # Strip reasoning XML tags that shouldn't leak to parent display + _think_text = re.sub( + r'', '', _think_text + ).strip() + # For subagents: relay first line to parent display (existing behaviour). + # For all agents with a structured callback: emit reasoning.available event. + first_line = _think_text.split('\n')[0][:80] if _think_text else "" + if first_line and getattr(agent, '_delegate_depth', 0) > 0: + try: + agent.tool_progress_callback("_thinking", first_line) + except Exception: + pass + elif _think_text: + try: + agent.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None) + except Exception: + pass + + # Check for incomplete (opened but never closed) + # This means the model ran out of output tokens mid-reasoning — retry up to 2 times + if has_incomplete_scratchpad(assistant_message.content or ""): + agent._incomplete_scratchpad_retries += 1 + + agent._vprint(f"{agent.log_prefix}⚠️ Incomplete detected (opened but never closed)") + + if agent._incomplete_scratchpad_retries <= 2: + agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._incomplete_scratchpad_retries}/2)...") + # Don't add the broken message, just retry + continue + else: + # Max retries - discard this turn and save as partial + agent._vprint(f"{agent.log_prefix}❌ Max retries (2) for incomplete scratchpad. 
Saving as partial.", force=True) + agent._incomplete_scratchpad_retries = 0 + + rolled_back_messages = agent._get_messages_up_to_last_assistant(messages) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + + return { + "final_response": None, + "messages": rolled_back_messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Incomplete REASONING_SCRATCHPAD after 2 retries" + } + + # Reset incomplete scratchpad counter on clean response + agent._incomplete_scratchpad_retries = 0 + + if agent.api_mode == "codex_responses" and finish_reason == "incomplete": + agent._codex_incomplete_retries += 1 + + interim_msg = agent._build_assistant_message(assistant_message, finish_reason) + interim_has_content = bool((interim_msg.get("content") or "").strip()) + interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False + interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items")) + interim_has_codex_message_items = bool(interim_msg.get("codex_message_items")) + + if ( + interim_has_content + or interim_has_reasoning + or interim_has_codex_reasoning + or interim_has_codex_message_items + ): + last_msg = messages[-1] if messages else None + # Duplicate detection: two consecutive incomplete assistant + # messages with identical content AND reasoning are collapsed. + # For provider-state-only changes (encrypted reasoning + # items or replayable message ids/phases/statuses differ + # while visible content/reasoning are unchanged), compare + # those opaque payloads too so we don't silently drop the + # newer continuation state. 
+ last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None + interim_codex_items = interim_msg.get("codex_reasoning_items") + last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None + interim_codex_message_items = interim_msg.get("codex_message_items") + duplicate_interim = ( + isinstance(last_msg, dict) + and last_msg.get("role") == "assistant" + and last_msg.get("finish_reason") == "incomplete" + and (last_msg.get("content") or "") == (interim_msg.get("content") or "") + and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "") + and last_codex_items == interim_codex_items + and last_codex_message_items == interim_codex_message_items + ) + if not duplicate_interim: + messages.append(interim_msg) + agent._emit_interim_assistant_message(interim_msg) + + if agent._codex_incomplete_retries < 3: + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}↻ Codex response incomplete; continuing turn ({agent._codex_incomplete_retries}/3)") + agent._session_messages = messages + agent._save_session_log(messages) + continue + + agent._codex_incomplete_retries = 0 + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Codex response remained incomplete after 3 continuation attempts", + } + elif hasattr(agent, "_codex_incomplete_retries"): + agent._codex_incomplete_retries = 0 + + # Check for tool calls + if assistant_message.tool_calls: + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...") + + if agent.verbose_logging: + for tc in assistant_message.tool_calls: + logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") + + # Validate tool call names - detect model hallucinations + # Repair mismatched tool names 
before validating + for tc in assistant_message.tool_calls: + if tc.function.name not in agent.valid_tool_names: + repaired = agent._repair_tool_call(tc.function.name) + if repaired: + print(f"{agent.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'") + tc.function.name = repaired + invalid_tool_calls = [ + tc.function.name for tc in assistant_message.tool_calls + if tc.function.name not in agent.valid_tool_names + ] + if invalid_tool_calls: + # Track retries for invalid tool calls + agent._invalid_tool_retries += 1 + + # Return helpful error to model — model can self-correct next turn + available = ", ".join(sorted(agent.valid_tool_names)) + invalid_name = invalid_tool_calls[0] + invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name + agent._vprint(f"{agent.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for agent-correction ({agent._invalid_tool_retries}/3)") + + if agent._invalid_tool_retries >= 3: + agent._vprint(f"{agent.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True) + agent._invalid_tool_retries = 0 + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": f"Model generated invalid tool call: {invalid_preview}" + } + + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(assistant_msg) + for tc in assistant_message.tool_calls: + if tc.function.name not in agent.valid_tool_names: + content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}" + else: + content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call." 
+ messages.append({ + "role": "tool", + "name": tc.function.name, + "tool_call_id": tc.id, + "content": content, + }) + continue + # Reset retry counter on successful tool call validation + agent._invalid_tool_retries = 0 + + # Validate tool call arguments are valid JSON + # Handle empty strings as empty objects (common model quirk) + invalid_json_args = [] + for tc in assistant_message.tool_calls: + args = tc.function.arguments + if isinstance(args, (dict, list)): + tc.function.arguments = json.dumps(args) + continue + if args is not None and not isinstance(args, str): + tc.function.arguments = str(args) + args = tc.function.arguments + # Treat empty/whitespace strings as empty object + if not args or not args.strip(): + tc.function.arguments = "{}" + continue + try: + json.loads(args) + except json.JSONDecodeError as e: + invalid_json_args.append((tc.function.name, str(e))) + + if invalid_json_args: + # Check if the invalid JSON is due to truncation rather + # than a model formatting mistake. Routers sometimes + # rewrite finish_reason from "length" to "tool_calls", + # hiding the truncation from the length handler above. + # Detect truncation: args that don't end with } or ] + # (after stripping whitespace) are cut off mid-stream. 
+ _truncated = any( + not (tc.function.arguments or "").rstrip().endswith(("}", "]")) + for tc in assistant_message.tool_calls + if tc.function.name in {n for n, _ in invalid_json_args} + ) + if _truncated: + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call arguments detected " + f"(finish_reason={finish_reason!r}) — refusing to execute.", + force=True, + ) + agent._invalid_json_retries = 0 + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit", + } + + # Track retries for invalid JSON arguments + agent._invalid_json_retries += 1 + + tool_name, error_msg = invalid_json_args[0] + agent._vprint(f"{agent.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") + + if agent._invalid_json_retries < 3: + agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._invalid_json_retries}/3)...") + # Don't add anything to messages, just retry the API call + continue + else: + # Instead of returning partial, inject tool error results so the model can recover. + # Using tool results (not user messages) preserves role alternation. + agent._vprint(f"{agent.log_prefix}⚠️ Injecting recovery tool results for invalid JSON...") + agent._invalid_json_retries = 0 # Reset for next attempt + + # Append the assistant message with its (broken) tool_calls + recovery_assistant = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(recovery_assistant) + + # Respond with tool error results for each tool call + invalid_names = {name for name, _ in invalid_json_args} + for tc in assistant_message.tool_calls: + if tc.function.name in invalid_names: + err = next(e for n, e in invalid_json_args if n == tc.function.name) + tool_result = ( + f"Error: Invalid JSON arguments. {err}. 
" + f"For tools with no required parameters, use an empty object: {{}}. " + f"Please retry with valid JSON." + ) + else: + tool_result = "Skipped: other tool call in this response had invalid JSON." + messages.append({ + "role": "tool", + "name": tc.function.name, + "tool_call_id": tc.id, + "content": tool_result, + }) + continue + + # Reset retry counter on successful JSON validation + agent._invalid_json_retries = 0 + + # ── Post-call guardrails ────────────────────────── + assistant_message.tool_calls = agent._cap_delegate_task_calls( + assistant_message.tool_calls + ) + assistant_message.tool_calls = agent._deduplicate_tool_calls( + assistant_message.tool_calls + ) + + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + + # If this turn has both content AND tool_calls, capture the content + # as a fallback final response. Common pattern: model delivers its + # answer and calls memory/skill tools as a side-effect in the same + # turn. If the follow-up turn after tools is empty, we use this. + turn_content = assistant_message.content or "" + if turn_content and agent._has_content_after_think_block(turn_content): + agent._last_content_with_tools = turn_content + # Only mute subsequent output when EVERY tool call in + # this turn is post-response housekeeping (memory, todo, + # skill_manage, etc.). If any substantive tool is present + # (search_files, read_file, write_file, terminal, ...), + # keep output visible so the user sees progress. 
+ _HOUSEKEEPING_TOOLS = frozenset({ + "memory", "todo", "skill_manage", "session_search", + }) + _all_housekeeping = all( + tc.function.name in _HOUSEKEEPING_TOOLS + for tc in assistant_message.tool_calls + ) + agent._last_content_tools_all_housekeeping = _all_housekeeping + if _all_housekeeping and agent._has_stream_consumers(): + agent._mute_post_response = True + elif agent._should_emit_quiet_tool_messages(): + clean = agent._strip_think_blocks(turn_content).strip() + if clean: + agent._vprint(f" ┊ 💬 {clean}") + + # Pop thinking-only prefill message(s) before appending + # (tool-call path — same rationale as the final-response path). + _had_prefill = False + while ( + messages + and isinstance(messages[-1], dict) + and messages[-1].get("_thinking_prefill") + ): + messages.pop() + _had_prefill = True + + # Reset prefill counter when tool calls follow a prefill + # recovery. Without this, the counter accumulates across + # the whole conversation — a model that intermittently + # empties (empty → prefill → tools → empty → prefill → + # tools) burns both prefill attempts and the third empty + # gets zero recovery. Resetting here treats each tool- + # call success as a fresh start. + if _had_prefill: + agent._thinking_prefill_retries = 0 + agent._empty_content_retries = 0 + # Successful tool execution — reset the post-tool nudge + # flag so it can fire again if the model goes empty on + # a LATER tool round. + agent._post_tool_empty_retried = False + + messages.append(assistant_msg) + agent._emit_interim_assistant_message(assistant_msg) + + # Close any open streaming display (response box, reasoning + # box) before tool execution begins. Intermediate turns may + # have streamed early content that opened the response box; + # flushing here prevents it from wrapping tool feed lines. + # Only signal the display callback — TTS (_stream_callback) + # should NOT receive None (it uses None as end-of-stream). 
+ if agent.stream_delta_callback: + try: + agent.stream_delta_callback(None) + except Exception: + pass + + agent._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count) + + if agent._tool_guardrail_halt_decision is not None: + decision = agent._tool_guardrail_halt_decision + _turn_exit_reason = "guardrail_halt" + final_response = agent._toolguard_controlled_halt_response(decision) + agent._emit_status( + f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}" + ) + messages.append({"role": "assistant", "content": final_response}) + break + + # Reset per-turn retry counters after successful tool + # execution so a single truncation doesn't poison the + # entire conversation. + truncated_tool_call_retries = 0 + + # Signal that a paragraph break is needed before the next + # streamed text. We don't emit it immediately because + # multiple consecutive tool iterations would stack up + # redundant blank lines. Instead, _fire_stream_delta() + # will prepend a single "\n\n" the next time real text + # arrives. + agent._stream_needs_break = True + + # Refund the iteration if the ONLY tool(s) called were + # execute_code (programmatic tool calling). These are + # cheap RPC-style calls that shouldn't eat the budget. + _tc_names = {tc.function.name for tc in assistant_message.tool_calls} + if _tc_names == {"execute_code"}: + agent.iteration_budget.refund() + + # Use real token counts from the API response to decide + # compression. prompt_tokens + completion_tokens is the + # actual context size the provider reported plus the + # assistant turn — a tight lower bound for the next prompt. + # Tool results appended above aren't counted yet, but the + # threshold (default 50%) leaves ample headroom; if tool + # results push past it, the next API call will report the + # real total and trigger compression then. 
+ # + # If last_prompt_tokens is 0 (stale after API disconnect + # or provider returned no usage data), fall back to rough + # estimate to avoid missing compression. Without this, + # a session can grow unbounded after disconnects because + # should_compress(0) never fires. (#2153) + _compressor = agent.context_compressor + if _compressor.last_prompt_tokens > 0: + # Only use prompt_tokens — completion/reasoning + # tokens don't consume context window space. + # Thinking models (GLM-5.1, QwQ, DeepSeek R1) + # inflate completion_tokens with reasoning, + # causing premature compression. (#12026) + _real_tokens = _compressor.last_prompt_tokens + else: + # Include tool schemas — with 50+ tools enabled + # these add 20-30K tokens the messages-only + # estimate misses, which can skip compression + # past the configured threshold (#14695). + _real_tokens = estimate_request_tokens_rough( + messages, tools=agent.tools or None + ) + + if agent.compression_enabled and _compressor.should_compress(_real_tokens): + agent._safe_print(" ⟳ compacting context…") + messages, active_system_prompt = agent._compress_context( + messages, system_message, + approx_tokens=agent.context_compressor.last_prompt_tokens, + task_id=effective_task_id, + ) + # Compression created a new session — clear history so + # _flush_messages_to_session_db writes compressed messages + # to the new session (see preflight compression comment). + conversation_history = None + + # Save session log incrementally (so progress is visible even if interrupted) + agent._session_messages = messages + agent._save_session_log(messages) + + # Continue loop for next response + continue + + else: + # No tool calls - this is the final response + final_response = assistant_message.content or "" + + # Fix: unmute output when entering the no-tool-call branch + # so the user can see empty-response warnings and recovery + # status messages. 
_mute_post_response was set during a + # prior housekeeping tool turn and should not silence the + # final response path. + agent._mute_post_response = False + + # Check if response only has think block with no actual content after it + if not agent._has_content_after_think_block(final_response): + # ── Partial stream recovery ───────────────────── + # If content was already streamed to the user before + # the connection died, use it as the final response + # instead of falling through to prior-turn fallback + # or wasting API calls on retries. + _partial_streamed = ( + getattr(agent, "_current_streamed_assistant_text", "") or "" + ) + if agent._has_content_after_think_block(_partial_streamed): + _turn_exit_reason = "partial_stream_recovery" + _recovered = agent._strip_think_blocks(_partial_streamed).strip() + logger.info( + "Partial stream content delivered (%d chars) " + "— using as final response", + len(_recovered), + ) + agent._emit_status( + "↻ Stream interrupted — using delivered content " + "as final response" + ) + final_response = _recovered + agent._response_was_previewed = True + break + + # If the previous turn already delivered real content alongside + # HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save), + # the model has nothing more to say. Use the earlier content + # immediately instead of wasting API calls on retries. + # NOTE: Only use this shortcut when ALL tools in that turn were + # housekeeping (memory, todo, etc.). When substantive tools + # were called (terminal, search_files, etc.), the content was + # likely mid-task narration ("I'll scan the directory...") and + # the empty follow-up means the model choked — let the + # post-tool nudge below handle that instead of exiting early. 
+ fallback = getattr(agent, '_last_content_with_tools', None) + if fallback and getattr(agent, '_last_content_tools_all_housekeeping', False): + _turn_exit_reason = "fallback_prior_turn_content" + logger.info("Empty follow-up after tool calls — using prior turn content as final response") + agent._emit_status("↻ Empty response after tool calls — using earlier content as final answer") + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._empty_content_retries = 0 + # Do NOT modify the assistant message content — the + # old code injected "Calling the X tools..." which + # poisoned the conversation history. Just use the + # fallback text as the final response and break. + final_response = agent._strip_think_blocks(fallback).strip() + agent._response_was_previewed = True + break + + # ── Post-tool-call empty response nudge ─────────── + # The model returned empty after executing tool calls. + # This covers two cases: + # (a) No prior-turn content at all — model went silent + # (b) Prior turn had content + SUBSTANTIVE tools (the + # fallback above was skipped because the content + # was mid-task narration, not a final answer) + # Instead of giving up, nudge the model to continue by + # appending a user-level hint. This is the #9400 case: + # weaker models (mimo-v2-pro, GLM-5, etc.) sometimes + # return empty after tool results instead of continuing + # to the next step. One retry with a nudge usually + # fixes it. + _prior_was_tool = any( + m.get("role") == "tool" + for m in messages[-5:] # check recent messages + ) + # Detect Qwen3/Ollama-style in-content thinking blocks. + # Ollama puts in the content field (not in + # reasoning_content), so _has_structured below would + # miss it. We check here so thinking-only responses + # after tool calls route to prefill instead of nudge. 
+ _has_inline_thinking = bool( + re.search( + r'||', + final_response or "", + re.IGNORECASE, + ) + ) + if ( + _prior_was_tool + and not getattr(agent, "_post_tool_empty_retried", False) + and not _has_inline_thinking # thinking model still working — let prefill handle + ): + agent._post_tool_empty_retried = True + # Clear stale narration so it doesn't resurface + # on a later empty response after the nudge. + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + logger.info( + "Empty response after tool calls — nudging model " + "to continue processing" + ) + agent._emit_status( + "⚠️ Model returned empty after tool calls — " + "nudging to continue" + ) + # Append the empty assistant message first so the + # message sequence stays valid: + # tool(result) → assistant("(empty)") → user(nudge) + # Without this, we'd have tool → user which most + # APIs reject as an invalid sequence. + _nudge_msg = agent._build_assistant_message(assistant_message, finish_reason) + _nudge_msg["content"] = "(empty)" + _nudge_msg["_empty_recovery_synthetic"] = True + messages.append(_nudge_msg) + messages.append({ + "role": "user", + "content": ( + "You just executed tool calls but returned an " + "empty response. Please process the tool " + "results above and continue with the task." + ), + "_empty_recovery_synthetic": True, + }) + continue + + # ── Thinking-only prefill continuation ────────── + # The model produced structured reasoning (via API + # fields) but no visible text content. Rather than + # giving up, append the assistant message as-is and + # continue — the model will see its own reasoning + # on the next turn and produce the text portion. + # Inspired by clawdbot's "incomplete-text" recovery. + # Also covers Qwen3/Ollama in-content blocks + # (detected above as _has_inline_thinking). 
+ _has_structured = bool( + getattr(assistant_message, "reasoning", None) + or getattr(assistant_message, "reasoning_content", None) + or getattr(assistant_message, "reasoning_details", None) + or _has_inline_thinking + ) + if _has_structured and agent._thinking_prefill_retries < 2: + agent._thinking_prefill_retries += 1 + logger.info( + "Thinking-only response (no visible content) — " + "prefilling to continue (%d/2)", + agent._thinking_prefill_retries, + ) + agent._emit_status( + f"↻ Thinking-only response — prefilling to continue " + f"({agent._thinking_prefill_retries}/2)" + ) + interim_msg = agent._build_assistant_message( + assistant_message, "incomplete" + ) + interim_msg["_thinking_prefill"] = True + messages.append(interim_msg) + agent._session_messages = messages + agent._save_session_log(messages) + continue + + # ── Empty response retry ────────────────────── + # Model returned nothing usable. Retry up to 3 + # times before attempting fallback. This covers + # both truly empty responses (no content, no + # reasoning) AND reasoning-only responses after + # prefill exhaustion — models like mimo-v2-pro + # always populate reasoning fields via OpenRouter, + # so the old `not _has_structured` guard blocked + # retries for every reasoning model after prefill. 
+ _truly_empty = not agent._strip_think_blocks( + final_response + ).strip() + _prefill_exhausted = ( + _has_structured + and agent._thinking_prefill_retries >= 2 + ) + if _truly_empty and (not _has_structured or _prefill_exhausted) and agent._empty_content_retries < 3: + agent._empty_content_retries += 1 + logger.warning( + "Empty response (no content or reasoning) — " + "retry %d/3 (model=%s)", + agent._empty_content_retries, agent.model, + ) + agent._emit_status( + f"⚠️ Empty response from model — retrying " + f"({agent._empty_content_retries}/3)" + ) + continue + + # ── Exhausted retries — try fallback provider ── + # Before giving up with "(empty)", attempt to + # switch to the next provider in the fallback + # chain. This covers the case where a model + # (e.g. GLM-4.5-Air) consistently returns empty + # due to context degradation or provider issues. + if _truly_empty and agent._fallback_chain: + logger.warning( + "Empty response after %d retries — " + "attempting fallback (model=%s, provider=%s)", + agent._empty_content_retries, agent.model, + agent.provider, + ) + agent._emit_status( + "⚠️ Model returning empty responses — " + "switching to fallback provider..." + ) + if agent._try_activate_fallback(): + agent._empty_content_retries = 0 + agent._emit_status( + f"↻ Switched to fallback: {agent.model} " + f"({agent.provider})" + ) + logger.info( + "Fallback activated after empty responses: " + "now using %s on %s", + agent.model, agent.provider, + ) + continue + + # Exhausted retries and fallback chain (or no + # fallback configured). Fall through to the + # "(empty)" terminal. + _turn_exit_reason = "empty_response_exhausted" + reasoning_text = agent._extract_reasoning(assistant_message) + agent._drop_trailing_empty_response_scaffolding(messages) + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + assistant_msg["content"] = "(empty)" + # This is a user-facing failure sentinel for the gateway, + # not real assistant content. 
Persisting it makes later + # "continue" turns replay assistant("(empty)") as if it + # were a meaningful model response, which can keep long + # tool-heavy sessions stuck in empty-response loops. + assistant_msg["_empty_terminal_sentinel"] = True + messages.append(assistant_msg) + + if reasoning_text: + reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text + logger.warning( + "Reasoning-only response (no visible content) " + "after exhausting retries and fallback. " + "Reasoning: %s", reasoning_preview, + ) + agent._emit_status( + "⚠️ Model produced reasoning but no visible " + "response after all retries. Returning empty." + ) + else: + logger.warning( + "Empty response (no content or reasoning) " + "after %d retries. No fallback available. " + "model=%s provider=%s", + agent._empty_content_retries, agent.model, + agent.provider, + ) + agent._emit_status( + "❌ Model returned no content after all retries" + + (" and fallback attempts." if agent._fallback_chain else + ". No fallback providers configured.") + ) + + final_response = "(empty)" + break + + # Reset retry counter/signature on successful content + agent._empty_content_retries = 0 + agent._thinking_prefill_retries = 0 + + if ( + agent.api_mode == "codex_responses" + and agent.valid_tool_names + and codex_ack_continuations < 2 + and agent._looks_like_codex_intermediate_ack( + user_message=user_message, + assistant_content=final_response, + messages=messages, + ) + ): + codex_ack_continuations += 1 + interim_msg = agent._build_assistant_message(assistant_message, "incomplete") + messages.append(interim_msg) + agent._emit_interim_assistant_message(interim_msg) + + continue_msg = { + "role": "user", + "content": ( + "[System: Continue now. 
Execute the required tool calls and only " + "send your final answer after completing the task.]" + ), + } + messages.append(continue_msg) + agent._session_messages = messages + agent._save_session_log(messages) + continue + + codex_ack_continuations = 0 + + if truncated_response_parts: + final_response = "".join(truncated_response_parts) + final_response + truncated_response_parts = [] + length_continue_retries = 0 + + final_response = agent._strip_think_blocks(final_response).strip() + + final_msg = agent._build_assistant_message(assistant_message, finish_reason) + + # Pop thinking-only prefill and empty-response retry + # scaffolding before appending the final response. These + # internal turns are only for the next API retry and should + # not become durable transcript context. + while ( + messages + and isinstance(messages[-1], dict) + and ( + messages[-1].get("_thinking_prefill") + or messages[-1].get("_empty_recovery_synthetic") + or messages[-1].get("_empty_terminal_sentinel") + ) + ): + messages.pop() + + messages.append(final_msg) + + _turn_exit_reason = f"text_response(finish_reason={finish_reason})" + if not agent.quiet_mode: + agent._safe_print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)") + break + + except Exception as e: + error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}" + try: + print(f"❌ {error_msg}") + except (OSError, ValueError): + logger.error(error_msg) + + logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True) + + # If an assistant message with tool_calls was already appended, + # the API expects a role="tool" result for every tool_call_id. + # Fill in error results for any that weren't answered yet. 
+ for idx in range(len(messages) - 1, -1, -1): + msg = messages[idx] + if not isinstance(msg, dict): + break + if msg.get("role") == "tool": + continue + if msg.get("role") == "assistant" and msg.get("tool_calls"): + answered_ids = { + m["tool_call_id"] + for m in messages[idx + 1:] + if isinstance(m, dict) and m.get("role") == "tool" + } + for tc in msg["tool_calls"]: + if not tc or not isinstance(tc, dict): continue + if tc["id"] not in answered_ids: + err_msg = { + "role": "tool", + "name": _ra().AIAgent._get_tool_call_name_static(tc), + "tool_call_id": tc["id"], + "content": f"Error executing tool: {error_msg}", + } + messages.append(err_msg) + break + + # Non-tool errors don't need a synthetic message injected. + # The error is already printed to the user (line above), and + # the retry loop continues. Injecting a fake user/assistant + # message pollutes history, burns tokens, and risks violating + # role-alternation invariants. + + # If we're near the limit, break to avoid infinite loops + if api_call_count >= agent.max_iterations - 1: + _turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})" + final_response = f"I apologize, but I encountered repeated errors: {error_msg}" + # Append as assistant so the history stays valid for + # session resume (avoids consecutive user messages). + messages.append({"role": "assistant", "content": final_response}) + break + + if final_response is None and ( + api_call_count >= agent.max_iterations + or agent.iteration_budget.remaining <= 0 + ): + # Budget exhausted — ask the model for a summary via one extra + # API call with tools stripped. _handle_max_iterations injects a + # user message and makes a single toolless request. 
+ _turn_exit_reason = f"max_iterations_reached({api_call_count}/{agent.max_iterations})" + agent._emit_status( + f"⚠️ Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) " + "— asking model to summarise" + ) + if not agent.quiet_mode: + agent._safe_print( + f"\n⚠️ Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) " + "— requesting summary..." + ) + final_response = agent._handle_max_iterations(messages, api_call_count) + + # If running as a kanban worker, block the task so the dispatcher + # knows the worker could not complete (rather than treating it as a + # protocol violation). The agent loop strips tools before calling + # _handle_max_iterations, so the model cannot call kanban_block + # itself — we must do it on its behalf. + _kanban_task = os.environ.get("HERMES_KANBAN_TASK") + if _kanban_task: + try: + _ra().handle_function_call( + "kanban_block", + { + "task_id": _kanban_task, + "reason": ( + f"Iteration budget exhausted " + f"({api_call_count}/{agent.max_iterations}) — " + "task could not complete within the allowed " + "iterations" + ), + }, + task_id=effective_task_id, + ) + logger.info( + "kanban_block called for task %s after iteration " + "exhaustion (%d/%d)", + _kanban_task, api_call_count, agent.max_iterations, + ) + except Exception: + logger.warning( + "Failed to call kanban_block after iteration " + "exhaustion for task %s", + _kanban_task, + exc_info=True, + ) + + # Determine if conversation completed successfully + completed = final_response is not None and api_call_count < agent.max_iterations + + # Save trajectory if enabled. ``user_message`` may be a multimodal + # list of parts; the trajectory format wants a plain string. 
+ agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) + + # Clean up VM and browser for this task after conversation completes + agent._cleanup_task_resources(effective_task_id) + + # Persist session to both JSON log and SQLite only after private retry + # scaffolding has been removed. Otherwise a later user "continue" turn + # can replay assistant("(empty)") / recovery nudges and fall into the + # same empty-response loop again. + agent._drop_trailing_empty_response_scaffolding(messages) + agent._persist_session(messages, conversation_history) + + # ── Turn-exit diagnostic log ───────────────────────────────────── + # Always logged at INFO so agent.log captures WHY every turn ended. + # When the last message is a tool result (agent was mid-work), log + # at WARNING — this is the "just stops" scenario users report. + _last_msg_role = messages[-1].get("role") if messages else None + _last_tool_name = None + if _last_msg_role == "tool": + # Walk back to find the assistant message with the tool call + for _m in reversed(messages): + if _m.get("role") == "assistant" and _m.get("tool_calls"): + _tcs = _m["tool_calls"] + if _tcs and isinstance(_tcs[0], dict): + _last_tool_name = _tcs[-1].get("function", {}).get("name") + break + + _turn_tool_count = sum( + 1 for m in messages + if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls") + ) + _resp_len = len(final_response) if final_response else 0 + _budget_used = agent.iteration_budget.used if agent.iteration_budget else 0 + _budget_max = agent.iteration_budget.max_total if agent.iteration_budget else 0 + + _diag_msg = ( + "Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d " + "tool_turns=%d last_msg_role=%s response_len=%d session=%s" + ) + _diag_args = ( + _turn_exit_reason, agent.model, api_call_count, agent.max_iterations, + _budget_used, _budget_max, + _turn_tool_count, _last_msg_role, _resp_len, + agent.session_id or "none", + ) + + if 
_last_msg_role == "tool" and not interrupted: + # Agent was mid-work — this is the "just stops" case. + logger.warning( + "Turn ended with pending tool result (agent may appear stuck). " + + _diag_msg + " last_tool=%s", + *_diag_args, _last_tool_name, + ) + else: + logger.info(_diag_msg, *_diag_args) + + # File-mutation verifier footer. + # If one or more ``write_file`` / ``patch`` calls failed during this + # turn and were never superseded by a successful write to the same + # path, append an advisory footer to the assistant response. This + # catches the specific case — reported by Ben Eng (#15524-adjacent) + # — where a model issues a batch of parallel patches, half of them + # fail with "Could not find old_string", and the model summarises + # the turn claiming every file was edited. The user then has to + # manually run ``git status`` to catch the lie. With this footer + # the truth is surfaced on every turn, so over-claiming is + # structurally impossible past the model. + # + # Gate: only applied when a real text response exists for this + # turn and the user didn't interrupt. Empty/interrupted turns + # already have other surface text that shouldn't be augmented. + if final_response and not interrupted: + try: + _failed = getattr(agent, "_turn_failed_file_mutations", None) or {} + if _failed and agent._file_mutation_verifier_enabled(): + footer = agent._format_file_mutation_failure_footer(_failed) + if footer: + final_response = final_response.rstrip() + "\n\n" + footer + except Exception as _ver_err: + logger.debug("file-mutation verifier footer failed: %s", _ver_err) + + # Plugin hook: transform_llm_output + # Fired once per turn after the tool-calling loop completes. + # Plugins can transform the LLM's output text before it's returned. + # First hook to return a string wins; None/empty return leaves text unchanged. 
+ if final_response and not interrupted: + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _transform_results = _invoke_hook( + "transform_llm_output", + response_text=final_response, + session_id=agent.session_id or "", + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + for _hook_result in _transform_results: + if isinstance(_hook_result, str) and _hook_result: + final_response = _hook_result + break # First non-empty string wins + except Exception as exc: + logger.warning("transform_llm_output hook failed: %s", exc) + + # Plugin hook: post_llm_call + # Fired once per turn after the tool-calling loop completes. + # Plugins can use this to persist conversation data (e.g. sync + # to an external memory system). + if final_response and not interrupted: + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "post_llm_call", + session_id=agent.session_id, + user_message=original_user_message, + assistant_response=final_response, + conversation_history=list(messages), + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + except Exception as exc: + logger.warning("post_llm_call hook failed: %s", exc) + + # Extract reasoning from the CURRENT turn only. Walk backwards + # but stop at the user message that started this turn — anything + # earlier is from a prior turn and must not leak into the reasoning + # box (confusing stale display; #17055). Within the current turn + # we still want the *most recent* non-empty reasoning: many + # providers (Claude thinking, DeepSeek v4, Codex Responses) emit + # reasoning on the tool-call step and leave the final-answer step + # with reasoning=None, so picking only the last assistant would + # silently drop legitimate same-turn reasoning. 
+ last_reasoning = None + for msg in reversed(messages): + if msg.get("role") == "user": + break # turn boundary — don't cross into prior turns + if msg.get("role") == "assistant" and msg.get("reasoning"): + last_reasoning = msg["reasoning"] + break + + # Build result with interrupt info if applicable + result = { + "final_response": final_response, + "last_reasoning": last_reasoning, + "messages": messages, + "api_calls": api_call_count, + "completed": completed, + "turn_exit_reason": _turn_exit_reason, + "partial": False, # True only when stopped due to invalid tool calls + "interrupted": interrupted, + "response_previewed": getattr(agent, "_response_was_previewed", False), + "model": agent.model, + "provider": agent.provider, + "base_url": agent.base_url, + "input_tokens": agent.session_input_tokens, + "output_tokens": agent.session_output_tokens, + "cache_read_tokens": agent.session_cache_read_tokens, + "cache_write_tokens": agent.session_cache_write_tokens, + "reasoning_tokens": agent.session_reasoning_tokens, + "prompt_tokens": agent.session_prompt_tokens, + "completion_tokens": agent.session_completion_tokens, + "total_tokens": agent.session_total_tokens, + "last_prompt_tokens": getattr(agent.context_compressor, "last_prompt_tokens", 0) or 0, + "estimated_cost_usd": agent.session_estimated_cost_usd, + "cost_status": agent.session_cost_status, + "cost_source": agent.session_cost_source, + } + if agent._tool_guardrail_halt_decision is not None: + result["guardrail"] = agent._tool_guardrail_halt_decision.to_metadata() + # If a /steer landed after the final assistant turn (no more tool + # batches to drain into), hand it back to the caller so it can be + # delivered as the next user turn instead of being silently lost. 
+ _leftover_steer = agent._drain_pending_steer() + if _leftover_steer: + result["pending_steer"] = _leftover_steer + agent._response_was_previewed = False + + # Include interrupt message if one triggered the interrupt + if interrupted and agent._interrupt_message: + result["interrupt_message"] = agent._interrupt_message + + # Clear interrupt state after handling + agent.clear_interrupt() + + # Clear stream callback so it doesn't leak into future calls + agent._stream_callback = None + + # Check skill trigger NOW — based on how many tool iterations THIS turn used. + _should_review_skills = False + if (agent._skill_nudge_interval > 0 + and agent._iters_since_skill >= agent._skill_nudge_interval + and "skill_manage" in agent.valid_tool_names): + _should_review_skills = True + agent._iters_since_skill = 0 + + # External memory provider: sync the completed turn + queue next prefetch. + agent._sync_external_memory_for_turn( + original_user_message=original_user_message, + final_response=final_response, + interrupted=interrupted, + ) + + # Background memory/skill review — runs AFTER the response is delivered + # so it never competes with the user's task for model attention. + if final_response and not interrupted and (_should_review_memory or _should_review_skills): + try: + agent._spawn_background_review( + messages_snapshot=list(messages), + review_memory=_should_review_memory, + review_skills=_should_review_skills, + ) + except Exception: + pass # Background review is best-effort + + # Note: Memory provider on_session_end() + shutdown_all() are NOT + # called here — run_conversation() is called once per user message in + # multi-turn sessions. Shutting down after every turn would kill the + # provider before the second message. Actual session-end cleanup is + # handled by the CLI (atexit / /reset) and gateway (session expiry / + # _reset_session). + + # Plugin hook: on_session_end + # Fired at the very end of every run_conversation call. 
class IterationBudget:
    """Lock-protected counter capping how many iterations one agent may run.

    Every agent (parent or subagent) owns its own budget. The parent's cap
    is ``max_iterations`` (default 90); each subagent gets an independent
    cap from ``delegation.max_iterations`` (default 50), so the combined
    total across parent + subagents can exceed the parent's cap. Users tune
    the per-subagent limit via ``delegation.max_iterations`` in config.yaml.

    Iterations spent on ``execute_code`` (programmatic tool calling) are
    handed back through :meth:`refund` so they don't count against the cap.
    """

    def __init__(self, max_total: int):
        self.max_total = max_total
        self._used = 0
        # Guards every read and write of ``_used``.
        self._lock = threading.Lock()

    def consume(self) -> bool:
        """Claim one iteration; return True if the budget still had room."""
        with self._lock:
            has_room = self._used < self.max_total
            if has_room:
                self._used += 1
            return has_room

    def refund(self) -> None:
        """Hand one iteration back (e.g. for execute_code turns)."""
        with self._lock:
            if self._used <= 0:
                # Nothing consumed yet — never let the counter go negative.
                return
            self._used -= 1

    @property
    def used(self) -> int:
        """Number of iterations consumed so far."""
        with self._lock:
            consumed = self._used
        return consumed

    @property
    def remaining(self) -> int:
        """Iterations still available, clamped at zero."""
        with self._lock:
            left = self.max_total - self._used
        return left if left > 0 else 0
def _sanitize_structure_surrogates(payload: Any) -> bool:
    """Scrub lone surrogates out of a nested dict/list payload, in place.

    Surrogate counterpart of ``_sanitize_structure_non_ascii``. Reaches
    string values buried in nested structured fields (e.g.
    ``reasoning_details`` — an array of dicts with ``summary``/``text``
    strings) that flat per-field checks never visit. Each offending code
    point is replaced with U+FFFD.

    Returns True if at least one string was rewritten, else False.
    """

    def _scrub(container) -> bool:
        # Normalise both container kinds to (slot, item) pairs so a single
        # loop can write replacements back via ``container[slot]``.
        if isinstance(container, dict):
            entries = container.items()
        elif isinstance(container, list):
            entries = enumerate(container)
        else:
            # Scalars (including a bare top-level string) are left alone.
            return False

        changed = False
        for slot, item in entries:
            if isinstance(item, str):
                if _SURROGATE_RE.search(item):
                    container[slot] = _SURROGATE_RE.sub('\ufffd', item)
                    changed = True
            elif isinstance(item, (dict, list)):
                # Always descend; only then fold the result into the flag.
                if _scrub(item):
                    changed = True
        return changed

    return _scrub(payload)
+ """ + found = False + for msg in messages: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if isinstance(content, str) and _SURROGATE_RE.search(content): + msg["content"] = _SURROGATE_RE.sub('\ufffd', content) + found = True + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text") + if isinstance(text, str) and _SURROGATE_RE.search(text): + part["text"] = _SURROGATE_RE.sub('\ufffd', text) + found = True + name = msg.get("name") + if isinstance(name, str) and _SURROGATE_RE.search(name): + msg["name"] = _SURROGATE_RE.sub('\ufffd', name) + found = True + tool_calls = msg.get("tool_calls") + if isinstance(tool_calls, list): + for tc in tool_calls: + if not isinstance(tc, dict): + continue + tc_id = tc.get("id") + if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id): + tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id) + found = True + fn = tc.get("function") + if isinstance(fn, dict): + fn_name = fn.get("name") + if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name): + fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name) + found = True + fn_args = fn.get("arguments") + if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args): + fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args) + found = True + # Walk any additional string / nested fields (reasoning, + # reasoning_content, reasoning_details, etc.) — surrogates from + # byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk + # in these fields and aren't covered by the per-field checks above. + # Matches _sanitize_messages_non_ascii's coverage (PR #10537). 
def _escape_invalid_chars_in_json_strings(raw: str) -> str:
    """Rewrite raw control characters found inside JSON string values.

    Scans *raw* one character at a time while tracking whether the scan
    position is inside a double-quoted string. Inside a string, a literal
    control character (0x00-0x1F) becomes its ``\\uXXXX`` escape; a
    backslash together with the character that follows it is copied
    through untouched. Text outside strings is copied verbatim.

    Ported from #12093 — complements the other repair passes in
    ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is
    not enough (e.g. llama.cpp backends that emit literal apostrophes or
    tabs alongside other malformations).
    """
    pieces: list[str] = []
    inside = False
    # True when the previous character was a backslash that opened an
    # escape pair, so the current character must be copied as-is.
    pending_escape = False
    last = len(raw) - 1

    for pos, char in enumerate(raw):
        if pending_escape:
            pieces.append(char)
            pending_escape = False
            continue

        if not inside:
            # Outside strings only an opening quote changes state.
            inside = char == '"'
            pieces.append(char)
            continue

        if char == "\\" and pos < last:
            # Start of an escape pair — keep both characters unchanged.
            pieces.append(char)
            pending_escape = True
        elif char == '"':
            inside = False
            pieces.append(char)
        elif char < " ":
            pieces.append("\\u%04x" % ord(char))
        else:
            pieces.append(char)

    return "".join(pieces)
+ """ + raw_stripped = raw_args.strip() if isinstance(raw_args, str) else "" + + # Fast-path: empty / whitespace-only -> empty object + if not raw_stripped: + logger.warning("Sanitized empty tool_call arguments for %s", tool_name) + return "{}" + + # Python-literal None -> normalise to {} + if raw_stripped == "None": + logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name) + return "{}" + + # Repair pass 0: llama.cpp backends sometimes emit literal control + # characters (tabs, newlines) inside JSON string values. json.loads + # with strict=False accepts these and lets us re-serialise the + # result into wire-valid JSON without any string surgery. This is + # the most common local-model repair case (#12068). + try: + parsed = json.loads(raw_stripped, strict=False) + reserialised = json.dumps(parsed, separators=(",", ":")) + if reserialised != raw_stripped: + logger.warning( + "Repaired unescaped control chars in tool_call arguments for %s", + tool_name, + ) + return reserialised + except (json.JSONDecodeError, TypeError, ValueError): + pass + + # Attempt common JSON repairs + fixed = raw_stripped + # 1. Strip trailing commas before } or ] + fixed = re.sub(r',\s*([}\]])', r'\1', fixed) + # 2. Close unclosed structures + open_curly = fixed.count('{') - fixed.count('}') + open_bracket = fixed.count('[') - fixed.count(']') + if open_curly > 0: + fixed += '}' * open_curly + if open_bracket > 0: + fixed += ']' * open_bracket + # 3. 
def _strip_non_ascii(text: str) -> str:
    """Return *text* with every non-ASCII character dropped.

    Anything the ASCII codec cannot encode is discarded via the ``ignore``
    error handler; no transliteration to a look-alike ASCII character is
    attempted. Used as a last resort when the system encoding is ASCII and
    can't handle any non-ASCII characters (e.g. LANG=C on Chromebooks).
    """
    ascii_only = text.encode("ascii", errors="ignore")
    return ascii_only.decode("ascii")
def _sanitize_tools_non_ascii(tools: list) -> bool:
    """Strip non-ASCII characters from tool payloads in-place.

    Delegates the whole walk to ``_sanitize_structure_non_ascii`` and
    passes its result straight back to the caller.
    """
    stripped_any = _sanitize_structure_non_ascii(tools)
    return stripped_any
+ + Preserves message alternation invariants: + * ``tool``-role messages whose content was entirely images are replaced + with a plaintext placeholder, NOT deleted — deleting them would leave + the paired ``tool_call_id`` on the prior assistant message unmatched, + which providers reject with HTTP 400. + * Non-tool messages whose content becomes empty are dropped. In + practice this only hits synthetic image-only user messages appended + for attachment delivery; real user turns always include text. + + Returns True if any image parts were removed. + """ + found = False + to_delete = [] + for i, msg in enumerate(messages): + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + new_parts = [] + for part in content: + if isinstance(part, dict) and part.get("type") in {"image_url", "image", "input_image"}: + found = True + else: + new_parts.append(part) + if len(new_parts) < len(content): + if new_parts: + msg["content"] = new_parts + elif msg.get("role") == "tool": + # Preserve tool_call_id linkage — providers require every + # assistant tool_call to have a matching tool response. + msg["content"] = "[image content removed — server does not support images]" + else: + # Synthetic image-only user/assistant message with no text; + # safe to drop. 
+ to_delete.append(i) + for i in reversed(to_delete): + del messages[i] + return found + + +def _sanitize_structure_non_ascii(payload: Any) -> bool: + """Strip non-ASCII characters from nested dict/list payloads in-place.""" + found = False + + def _walk(node): + nonlocal found + if isinstance(node, dict): + for key, value in node.items(): + if isinstance(value, str): + sanitized = _strip_non_ascii(value) + if sanitized != value: + node[key] = sanitized + found = True + elif isinstance(value, (dict, list)): + _walk(value) + elif isinstance(node, list): + for idx, value in enumerate(node): + if isinstance(value, str): + sanitized = _strip_non_ascii(value) + if sanitized != value: + node[idx] = sanitized + found = True + elif isinstance(value, (dict, list)): + _walk(value) + + _walk(payload) + return found + + +__all__ = [ + "_SURROGATE_RE", + "_sanitize_surrogates", + "_sanitize_structure_surrogates", + "_sanitize_messages_surrogates", + "_escape_invalid_chars_in_json_strings", + "_repair_tool_call_arguments", + "_strip_non_ascii", + "_sanitize_messages_non_ascii", + "_sanitize_tools_non_ascii", + "_strip_images_from_messages", + "_sanitize_structure_non_ascii", +] diff --git a/agent/process_bootstrap.py b/agent/process_bootstrap.py new file mode 100644 index 00000000000..fdd9053f5d8 --- /dev/null +++ b/agent/process_bootstrap.py @@ -0,0 +1,167 @@ +"""Process-level bootstrap helpers for ``run_agent``. + +Three concerns, all tied to ``AIAgent`` boot-time / runtime IO setup: + +1. **Lazy OpenAI SDK import** — ``_load_openai_cls`` + ``_OpenAIProxy`` + defer the 240ms-ish ``from openai import OpenAI`` cost until first use, + while preserving ``isinstance(client, OpenAI)`` checks and + ``patch("run_agent.OpenAI", ...)`` test patterns. + +2. **Crash-resistant stdio** — ``_SafeWriter`` wraps stdout/stderr so + ``OSError: Input/output error`` from broken pipes (systemd, Docker, + thread teardown races) cannot crash the agent. ``_install_safe_stdio`` + applies the wrapper. 
+ +3. **HTTP proxy resolution** — ``_get_proxy_from_env`` reads + ``HTTPS_PROXY`` / ``HTTP_PROXY`` / ``ALL_PROXY``; + ``_get_proxy_for_base_url`` respects ``NO_PROXY`` for the given base URL. + +``run_agent`` re-exports every name so existing +``from run_agent import _get_proxy_from_env`` imports keep working +unchanged. +""" + +from __future__ import annotations + +import os +import sys +import urllib.request +from typing import Optional + +from utils import base_url_hostname, normalize_proxy_url + + +# Cached at module level so we only pay the OpenAI SDK import cost once +# per process (after the first lazy load). +_OPENAI_CLS_CACHE = None + + +def _load_openai_cls() -> type: + """Import and cache ``openai.OpenAI``.""" + global _OPENAI_CLS_CACHE + if _OPENAI_CLS_CACHE is None: + from openai import OpenAI as _cls + _OPENAI_CLS_CACHE = _cls + return _OPENAI_CLS_CACHE + + +class _OpenAIProxy: + """Module-level proxy that looks like ``openai.OpenAI`` but imports lazily.""" + + __slots__ = () + + def __call__(self, *args, **kwargs): + return _load_openai_cls()(*args, **kwargs) + + def __instancecheck__(self, obj): + return isinstance(obj, _load_openai_cls()) + + def __repr__(self): + return "" + + +class _SafeWriter: + """Transparent stdio wrapper that catches OSError/ValueError from broken pipes. + + When hermes-agent runs as a systemd service, Docker container, or headless + daemon, the stdout/stderr pipe can become unavailable (idle timeout, buffer + exhaustion, socket reset). Any print() call then raises + ``OSError: [Errno 5] Input/output error``, which can crash agent setup or + run_conversation() — especially via double-fault when an except handler + also tries to print. + + Additionally, when subagents run in ThreadPoolExecutor threads, the shared + stdout handle can close between thread teardown and cleanup, raising + ``ValueError: I/O operation on closed file`` instead of OSError. 
+ + This wrapper delegates all writes to the underlying stream and silently + catches both OSError and ValueError. It is transparent when the wrapped + stream is healthy. + """ + + __slots__ = ("_inner",) + + def __init__(self, inner): + object.__setattr__(self, "_inner", inner) + + def write(self, data): + try: + return self._inner.write(data) + except (OSError, ValueError): + return len(data) if isinstance(data, str) else 0 + + def flush(self): + try: + self._inner.flush() + except (OSError, ValueError): + pass + + def fileno(self): + return self._inner.fileno() + + def isatty(self): + try: + return self._inner.isatty() + except (OSError, ValueError): + return False + + def __getattr__(self, name): + return getattr(self._inner, name) + + +def _get_proxy_from_env() -> Optional[str]: + """Read proxy URL from environment variables. + + Checks HTTPS_PROXY, HTTP_PROXY, ALL_PROXY (and lowercase variants) in order. + Returns the first valid proxy URL found, or None if no proxy is configured. 
def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    """Pick the proxy to use for *base_url*, honouring NO_PROXY.

    Starts from whatever ``_get_proxy_from_env()`` returns.  That value is
    returned unchanged when there is no proxy, no *base_url*, or no hostname
    can be extracted from *base_url*.  Otherwise the hostname is checked with
    ``urllib.request.proxy_bypass_environment``; a truthy answer means the
    host is excluded and ``None`` is returned.  Any exception raised by that
    check is ignored and the proxy is kept.
    """
    env_proxy = _get_proxy_from_env()
    if not (env_proxy and base_url):
        return env_proxy

    hostname = base_url_hostname(base_url)
    if not hostname:
        return env_proxy

    try:
        bypassed = bool(urllib.request.proxy_bypass_environment(hostname))
    except Exception:
        # Best-effort: a failing bypass lookup must not disable the proxy.
        bypassed = False

    return None if bypassed else env_proxy
def stream_diag_init() -> Dict[str, Any]:
    """Build the empty diagnostic record for one streaming attempt.

    The record starts with ``started_at`` set to the current wall-clock time
    (``time.time()``), zeroed ``chunks``/``bytes`` counters, an empty
    ``headers`` dict, and ``first_chunk_at``/``http_status`` unset (``None``).
    Every call returns a new dict with its own ``headers`` mapping, so callers
    may mutate it freely.
    """
    diag: Dict[str, Any] = dict(
        started_at=time.time(),
        first_chunk_at=None,
        chunks=0,
        bytes=0,
        headers={},
        http_status=None,
    )
    return diag
def flatten_exception_chain(error: BaseException) -> str:
    """Render an exception and its causes as ``Outer(msg) <- Inner(msg) <- ...``.

    Starting at *error*, each step follows ``__cause__`` when it is set and
    truthy, otherwise ``__context__``.  The walk stops after four exceptions,
    when the chain ends, or when an exception already collected shows up
    again (cycle guard).

    Each exception is rendered as ``ClassName(message)``; newlines in the
    message become spaces, messages longer than 140 characters are cut and
    suffixed with ``…``, and an empty message yields just ``ClassName``.
    """
    max_links = 4
    max_message = 140

    chain: List[BaseException] = []
    current: Optional[BaseException] = error
    while current is not None and len(chain) < max_links:
        if current in chain:
            break
        chain.append(current)
        following = getattr(current, "__cause__", None) or getattr(
            current, "__context__", None
        )
        if following is None or following is current:
            break
        current = following

    rendered: List[str] = []
    for exc in chain:
        cls_name = type(exc).__name__
        text = str(exc).strip().replace("\n", " ")
        if len(text) > max_message:
            text = text[:max_message] + "…"
        if text:
            rendered.append(f"{cls_name}({text})")
        else:
            rendered.append(cls_name)

    if not rendered:
        return type(error).__name__
    return " <- ".join(rendered)
+ + When *diag* is provided (the per-attempt stream-diagnostic dict from + :func:`stream_diag_init`), the WARNING also captures upstream headers + (cf-ray, x-openrouter-provider, x-openrouter-id), HTTP status, bytes + streamed before the drop, and elapsed time on the dying attempt. + These are the breadcrumbs needed to answer "is one CF edge / one + downstream provider responsible, or is it random across runs?" + """ + try: + try: + _summary = agent._summarize_api_error(error) + except Exception: + _summary = str(error) + if _summary and len(_summary) > 240: + _summary = _summary[:240] + "…" + + # Inner-cause chain (httpx errors hide under openai.APIError). + try: + _chain = flatten_exception_chain(error) + except Exception: + _chain = type(error).__name__ + + # Per-attempt counters and upstream headers. + _now = time.time() + _bytes = 0 + _chunks = 0 + _elapsed = 0.0 + _ttfb = None + _headers_repr = "-" + _http_status = "-" + if isinstance(diag, dict): + try: + _bytes = int(diag.get("bytes") or 0) + _chunks = int(diag.get("chunks") or 0) + _started = float(diag.get("started_at") or _now) + _elapsed = max(0.0, _now - _started) + _first = diag.get("first_chunk_at") + if _first is not None: + _ttfb = max(0.0, float(_first) - _started) + headers = diag.get("headers") or {} + if isinstance(headers, dict) and headers: + _headers_repr = " ".join( + f"{k}={v}" for k, v in headers.items() + ) + if diag.get("http_status") is not None: + _http_status = str(diag.get("http_status")) + except Exception: + pass + + logger.warning( + "Stream %s on attempt %s/%s — retrying. 
def emit_stream_drop(
    agent: Any,
    *,
    error: BaseException,
    attempt: int,
    max_attempts: int,
    mid_tool_call: bool,
    diag: Optional[Dict[str, Any]] = None,
) -> None:
    """Announce a stream drop and retry: once in the log, once in the UI.

    Steps, in order:
      1. Forward everything to :func:`log_stream_retry` with ``kind`` set to
         ``"drop mid tool-call"`` or ``"drop"`` depending on *mid_tool_call*.
      2. Build a compact status line from ``agent.provider`` (falling back to
         the word ``provider``), the error's class name and the attempt
         counter.  When *diag* is a dict carrying ``started_at``, an
         ``after X.Xs`` suffix with the elapsed time is added.
      3. Send the line through ``agent._emit_status`` and record the retry
         with ``agent._touch_activity``.

    Failures while computing the suffix or while emitting are swallowed;
    the announcement is best-effort.
    """
    drop_kind = "drop" if not mid_tool_call else "drop mid tool-call"
    log_stream_retry(
        agent,
        kind=drop_kind,
        error=error,
        attempt=attempt,
        max_attempts=max_attempts,
        mid_tool_call=mid_tool_call,
        diag=diag,
    )
    provider_label = agent.provider or "provider"

    # Elapsed-time suffix: lets the reader tell an immediate connection
    # failure from a stream that died after running for a while.
    elapsed_note = ""
    if isinstance(diag, dict):
        try:
            opened_at = diag.get("started_at")
            if opened_at is not None:
                elapsed_note = f" after {max(0.0, time.time() - float(opened_at)):.1f}s"
        except Exception:
            pass

    try:
        agent._emit_status(
            f"⚠️ {provider_label} stream {drop_kind} ({type(error).__name__}){elapsed_note} "
            f"— reconnecting, retry {attempt}/{max_attempts}"
        )
        agent._touch_activity(
            f"stream retry {attempt}/{max_attempts} "
            f"after {type(error).__name__}"
        )
    except Exception:
        pass
+ +Three tiers are joined with ``\\n\\n``: + +* ``stable`` — identity (SOUL.md or DEFAULT_AGENT_IDENTITY), tool + guidance, computer-use guidance, nous subscription block, tool-use + enforcement guidance + per-model operational guidance, skills prompt, + alibaba model-name workaround, environment hints, platform hints. +* ``context`` — caller-supplied ``system_message`` plus context files + (AGENTS.md / .cursorrules / etc.) discovered under ``TERMINAL_CWD``. +* ``volatile`` — memory snapshot, USER.md profile, external memory + provider block, timestamp/session/model/provider line. + +Pure helpers that read the agent's state. AIAgent keeps thin forwarders. +""" + +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from agent.prompt_builder import ( + DEFAULT_AGENT_IDENTITY, + GOOGLE_MODEL_OPERATIONAL_GUIDANCE, + HERMES_AGENT_HELP_GUIDANCE, + KANBAN_GUIDANCE, + MEMORY_GUIDANCE, + OPENAI_MODEL_EXECUTION_GUIDANCE, + PLATFORM_HINTS, + SESSION_SEARCH_GUIDANCE, + SKILLS_GUIDANCE, + TOOL_USE_ENFORCEMENT_GUIDANCE, + TOOL_USE_ENFORCEMENT_MODELS, +) + + +def _ra(): + """Lazy reference to the ``run_agent`` module. + + Helpers like ``load_soul_md``, ``build_environment_hints``, + ``build_context_files_prompt``, ``build_nous_subscription_prompt``, + ``build_skills_system_prompt`` and ``get_toolset_for_tool`` are + imported into ``run_agent``'s namespace. Many tests + ``patch("run_agent.load_soul_md", ...)``; if we imported them + directly here those patches would not reach us. Looking them up + through ``run_agent`` on every call preserves the patch contract. + """ + import run_agent + return run_agent + + +def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None) -> Dict[str, str]: + """Assemble the system prompt as three ordered parts. 
+ + Returns a dict with three keys: + * ``stable`` — identity, tool guidance, skills prompt, + environment hints, platform hints, model-family operational + guidance. + * ``context`` — context files (AGENTS.md, .cursorrules, etc.) + and caller-supplied system_message. + * ``volatile`` — memory snapshot, user profile, external + memory provider block, timestamp line. + + Joined into a single string by :func:`build_system_prompt` and + cached on ``agent._cached_system_prompt`` for the lifetime of the + AIAgent. Hermes never re-renders parts of this string mid- + session — that's the only way to keep upstream prompt caches + warm across turns. + """ + # Local import to avoid pulling model_tools at module load. Tests + # patch ``run_agent.get_toolset_for_tool`` and similar helpers, so + # we resolve through ``_ra()`` to honor those patches. + _r = _ra() + + # ── Stable tier ──────────────────────────────────────────────── + stable_parts: List[str] = [] + + # Try SOUL.md as primary identity unless the caller explicitly skipped it. + # Some execution modes (cron) still want HERMES_HOME persona while keeping + # cwd project instructions disabled. + _soul_loaded = False + if agent.load_soul_identity or not agent.skip_context_files: + _soul_content = _r.load_soul_md() + if _soul_content: + stable_parts.append(_soul_content) + _soul_loaded = True + + if not _soul_loaded: + # Fallback to hardcoded identity + stable_parts.append(DEFAULT_AGENT_IDENTITY) + + # Pointer to the hermes-agent skill + docs for user questions about Hermes itself. 
+ stable_parts.append(HERMES_AGENT_HELP_GUIDANCE) + + # Tool-aware behavioral guidance: only inject when the tools are loaded + tool_guidance = [] + if "memory" in agent.valid_tool_names: + tool_guidance.append(MEMORY_GUIDANCE) + if "session_search" in agent.valid_tool_names: + tool_guidance.append(SESSION_SEARCH_GUIDANCE) + if "skill_manage" in agent.valid_tool_names: + tool_guidance.append(SKILLS_GUIDANCE) + # Kanban worker/orchestrator lifecycle — only present when the + # dispatcher spawned this process (kanban_show check_fn gates on + # HERMES_KANBAN_TASK env var). Normal chat sessions never see + # this block. + if "kanban_show" in agent.valid_tool_names: + tool_guidance.append(KANBAN_GUIDANCE) + if tool_guidance: + stable_parts.append(" ".join(tool_guidance)) + + # Computer-use (macOS) — goes in as its own block rather than being + # merged into tool_guidance because the content is multi-paragraph. + if "computer_use" in agent.valid_tool_names: + from agent.prompt_builder import COMPUTER_USE_GUIDANCE + stable_parts.append(COMPUTER_USE_GUIDANCE) + + nous_subscription_prompt = _r.build_nous_subscription_prompt(agent.valid_tool_names) + if nous_subscription_prompt: + stable_parts.append(nous_subscription_prompt) + # Tool-use enforcement: tells the model to actually call tools instead + # of describing intended actions. 
Controlled by config.yaml + # agent.tool_use_enforcement: + # "auto" (default) — matches TOOL_USE_ENFORCEMENT_MODELS + # true — always inject (all models) + # false — never inject + # list — custom model-name substrings to match + if agent.valid_tool_names: + _enforce = agent._tool_use_enforcement + _inject = False + if _enforce is True or (isinstance(_enforce, str) and _enforce.lower() in {"true", "always", "yes", "on"}): + _inject = True + elif _enforce is False or (isinstance(_enforce, str) and _enforce.lower() in {"false", "never", "no", "off"}): + _inject = False + elif isinstance(_enforce, list): + model_lower = (agent.model or "").lower() + _inject = any(p.lower() in model_lower for p in _enforce if isinstance(p, str)) + else: + # "auto" or any unrecognised value — use hardcoded defaults + model_lower = (agent.model or "").lower() + _inject = any(p in model_lower for p in TOOL_USE_ENFORCEMENT_MODELS) + if _inject: + stable_parts.append(TOOL_USE_ENFORCEMENT_GUIDANCE) + _model_lower = (agent.model or "").lower() + # Google model operational guidance (conciseness, absolute + # paths, parallel tool calls, verify-before-edit, etc.) + if "gemini" in _model_lower or "gemma" in _model_lower: + stable_parts.append(GOOGLE_MODEL_OPERATIONAL_GUIDANCE) + # OpenAI GPT/Codex execution discipline (tool persistence, + # prerequisite checks, verification, anti-hallucination). 
+ if "gpt" in _model_lower or "codex" in _model_lower: + stable_parts.append(OPENAI_MODEL_EXECUTION_GUIDANCE) + + has_skills_tools = any(name in agent.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage']) + if has_skills_tools: + avail_toolsets = { + toolset + for toolset in ( + _r.get_toolset_for_tool(tool_name) for tool_name in agent.valid_tool_names + ) + if toolset + } + skills_prompt = _r.build_skills_system_prompt( + available_tools=agent.valid_tool_names, + available_toolsets=avail_toolsets, + ) + else: + skills_prompt = "" + if skills_prompt: + stable_parts.append(skills_prompt) + + # Alibaba Coding Plan API always returns "glm-4.7" as model name regardless + # of the requested model. Inject explicit model identity into the system prompt + # so the agent can correctly report which model it is (workaround for API bug). + # Stable for the lifetime of an agent instance — model and provider are fixed + # at construction time. + if agent.provider == "alibaba": + _model_short = agent.model.split("/")[-1] if "/" in agent.model else agent.model + stable_parts.append( + f"You are powered by the model named {_model_short}. " + f"The exact model ID is {agent.model}. " + f"When asked what model you are, always answer based on this information, " + f"not on any model name returned by the API." + ) + + # Environment hints (WSL, Termux, etc.) — tell the agent about the + # execution environment so it can translate paths and adapt behavior. + # Stable for the lifetime of the process. 
+ _env_hints = _r.build_environment_hints() + if _env_hints: + stable_parts.append(_env_hints) + + platform_key = (agent.platform or "").lower().strip() + if platform_key in PLATFORM_HINTS: + stable_parts.append(PLATFORM_HINTS[platform_key]) + elif platform_key: + # Check plugin registry for platform-specific LLM guidance + try: + from gateway.platform_registry import platform_registry + _entry = platform_registry.get(platform_key) + if _entry and _entry.platform_hint: + stable_parts.append(_entry.platform_hint) + except Exception: + pass + + # ── Context tier (cwd-dependent, may change between sessions) ─ + context_parts: List[str] = [] + + # Note: ephemeral_system_prompt is NOT included here. It's injected at + # API-call time only so it stays out of the cached/stored system prompt. + if system_message is not None: + context_parts.append(system_message) + + if not agent.skip_context_files: + # Use TERMINAL_CWD for context file discovery when set (gateway + # mode). The gateway process runs from the hermes-agent install + # dir, so os.getcwd() would pick up the repo's AGENTS.md and + # other dev files — inflating token usage by ~10k for no benefit. + _context_cwd = os.getenv("TERMINAL_CWD") or None + context_files_prompt = _r.build_context_files_prompt( + cwd=_context_cwd, skip_soul=_soul_loaded) + if context_files_prompt: + context_parts.append(context_files_prompt) + + # ── Volatile tier (changes per session/turn — never cached) ─── + volatile_parts: List[str] = [] + + if agent._memory_store: + if agent._memory_enabled: + mem_block = agent._memory_store.format_for_system_prompt("memory") + if mem_block: + volatile_parts.append(mem_block) + # USER.md is always included when enabled. 
+ if agent._user_profile_enabled: + user_block = agent._memory_store.format_for_system_prompt("user") + if user_block: + volatile_parts.append(user_block) + + # External memory provider system prompt block (additive to built-in) + if agent._memory_manager: + try: + _ext_mem_block = agent._memory_manager.build_system_prompt() + if _ext_mem_block: + volatile_parts.append(_ext_mem_block) + except Exception: + pass + + from hermes_time import now as _hermes_now + now = _hermes_now() + timestamp_line = f"Conversation started: {now.strftime('%A, %B %d, %Y %I:%M %p')}" + if agent.pass_session_id and agent.session_id: + timestamp_line += f"\nSession ID: {agent.session_id}" + if agent.model: + timestamp_line += f"\nModel: {agent.model}" + if agent.provider: + timestamp_line += f"\nProvider: {agent.provider}" + volatile_parts.append(timestamp_line) + + return { + "stable": "\n\n".join(p.strip() for p in stable_parts if p and p.strip()), + "context": "\n\n".join(p.strip() for p in context_parts if p and p.strip()), + "volatile": "\n\n".join(p.strip() for p in volatile_parts if p and p.strip()), + } + + +def build_system_prompt(agent: Any, system_message: Optional[str] = None) -> str: + """Assemble the full system prompt from all layers. + + Called once per session (cached on ``agent._cached_system_prompt``) and + only rebuilt after context compression events. This ensures the system + prompt is stable across all turns in a session, maximizing prefix cache + hits. + + Layers are ordered cache-friendly: stable identity/guidance first, + then session-stable context files, then per-call volatile content + (memory, USER profile, timestamp). The whole string is treated as + one cached block — Hermes never rebuilds or reinjects parts of it + mid-session, which is the only way to keep upstream prompt caches + warm across turns. 
def format_tools_for_system_message(agent: Any) -> str:
    """Serialize ``agent.tools`` into the trajectory-format JSON string.

    Each tool definition contributes one object holding its function's
    ``name``, ``description`` (default ``""``), ``parameters`` (default
    ``{}``) and a ``required`` key that is always ``null``.  Non-ASCII
    characters are emitted verbatim rather than escaped.

    Returns:
        str: the JSON array, or the literal ``"[]"`` when the agent has no
        tools.
    """
    if not agent.tools:
        return "[]"

    def _to_trajectory_entry(tool):
        spec = tool["function"]
        return {
            "name": spec["name"],
            "description": spec.get("description", ""),
            "parameters": spec.get("parameters", {}),
            "required": None,  # Match the format in the example
        }

    entries = [_to_trajectory_entry(tool) for tool in agent.tools]
    return json.dumps(entries, ensure_ascii=False)
+* ``_should_parallelize_tool_batch`` / ``_extract_parallel_scope_path`` / + ``_paths_overlap`` — the rules engine deciding when a multi-tool batch + can run concurrently. +* ``_is_multimodal_tool_result`` / ``_multimodal_text_summary`` / + ``_append_subdir_hint_to_multimodal`` — envelope helpers for the + ``{"_multimodal": True, "content": [...], "text_summary": ...}`` dict + shape returned by tools like ``computer_use``. +* ``_extract_file_mutation_targets`` / ``_extract_error_preview`` — + per-turn file-mutation verifier inputs. +* ``_trajectory_normalize_msg`` — strip image blobs from a message for + trajectory saving. + +All helpers are stateless. ``run_agent`` re-exports each name so existing +``from run_agent import ...`` imports in tests and other modules keep +working unchanged. +""" + +from __future__ import annotations + +import json +import logging +import os +import re +from pathlib import Path +from typing import Any, Dict, List, Optional + +from agent.tool_result_classification import ( + FILE_MUTATING_TOOL_NAMES as _FILE_MUTATING_TOOLS, +) + +logger = logging.getLogger(__name__) + +# Tools that must never run concurrently (interactive / user-facing). +# When any of these appear in a batch, we fall back to sequential execution. +_NEVER_PARALLEL_TOOLS = frozenset({"clarify"}) + +# Read-only tools with no shared mutable session state. +_PARALLEL_SAFE_TOOLS = frozenset({ + "ha_get_state", + "ha_list_entities", + "ha_list_services", + "read_file", + "search_files", + "session_search", + "skill_view", + "skills_list", + "vision_analyze", + "web_extract", + "web_search", +}) + +# File tools can run concurrently when they target independent paths. +_PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"}) + +# Patterns that indicate a terminal command may modify/delete files. 
+_DESTRUCTIVE_PATTERNS = re.compile( + r"""(?:^|\s|&&|\|\||;|`)(?: + rm\s|rmdir\s| + cp\s|install\s| + mv\s| + sed\s+-i| + truncate\s| + dd\s| + shred\s| + git\s+(?:reset|clean|checkout)\s + )""", + re.VERBOSE, +) +# Output redirects that overwrite files (> but not >>) +_REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]') + + +def _is_destructive_command(cmd: str) -> bool: + """Heuristic: does this terminal command look like it modifies/deletes files?""" + if not cmd: + return False + if _DESTRUCTIVE_PATTERNS.search(cmd): + return True + if _REDIRECT_OVERWRITE.search(cmd): + return True + return False + + +def _is_mcp_tool_parallel_safe(tool_name: str) -> bool: + """Check if an MCP tool comes from a server with parallel tool calls enabled. + + Lazy-imports from ``tools.mcp_tool`` to avoid circular dependencies. + Returns False if the MCP module is not available. + """ + try: + from tools.mcp_tool import is_mcp_tool_parallel_safe + return is_mcp_tool_parallel_safe(tool_name) + except Exception: + return False + + +def _should_parallelize_tool_batch(tool_calls) -> bool: + """Return True when a tool-call batch is safe to run concurrently.""" + if len(tool_calls) <= 1: + return False + + tool_names = [tc.function.name for tc in tool_calls] + if any(name in _NEVER_PARALLEL_TOOLS for name in tool_names): + return False + + reserved_paths: list[Path] = [] + for tool_call in tool_calls: + tool_name = tool_call.function.name + try: + function_args = json.loads(tool_call.function.arguments) + except Exception: + logging.debug( + "Could not parse args for %s — defaulting to sequential; raw=%s", + tool_name, + tool_call.function.arguments[:200], + ) + return False + if not isinstance(function_args, dict): + logging.debug( + "Non-dict args for %s (%s) — defaulting to sequential", + tool_name, + type(function_args).__name__, + ) + return False + + if tool_name in _PATH_SCOPED_TOOLS: + scoped_path = _extract_parallel_scope_path(tool_name, function_args) + if scoped_path is None: 
def _extract_parallel_scope_path(tool_name: str, function_args: dict) -> Optional[Path]:
    """Return the absolute, normalized ``path`` argument of a path-scoped tool.

    Returns ``None`` when ``tool_name`` is not in ``_PATH_SCOPED_TOOLS`` or when
    ``function_args["path"]`` is missing, not a string, or blank.
    """
    if tool_name not in _PATH_SCOPED_TOOLS:
        return None

    raw_path = function_args.get("path")
    if not (isinstance(raw_path, str) and raw_path.strip()):
        return None

    candidate = Path(raw_path).expanduser()
    if not candidate.is_absolute():
        candidate = Path.cwd() / candidate
    # abspath() normalizes without touching the filesystem; resolve() is
    # avoided because the target file may not exist yet.
    return Path(os.path.abspath(str(candidate)))


def _paths_overlap(left: Path, right: Path) -> bool:
    """Return True when one path is equal to, or an ancestor of, the other.

    The two paths are compared component by component over their common
    length, so a directory overlaps with everything beneath it.
    """
    if not left.parts or not right.parts:
        # Empty paths shouldn't reach here (guarded upstream); they never overlap.
        return False
    # zip() stops at the shorter path, i.e. compares exactly the common prefix.
    return all(a == b for a, b in zip(left.parts, right.parts))


def _is_multimodal_tool_result(value: Any) -> bool:
    """Return True if ``value`` is a multimodal tool-result envelope.

    An envelope is a dict whose ``_multimodal`` entry is exactly ``True`` and
    whose ``content`` entry is a list (OpenAI-style content parts). An optional
    ``text_summary`` entry serves string-only fallbacks.
    """
    if not isinstance(value, dict):
        return False
    return value.get("_multimodal") is True and isinstance(value.get("content"), list)
def _append_subdir_hint_to_multimodal(value: Dict[str, Any], hint: str) -> None:
    """Append a subdirectory hint to a multimodal tool-result envelope, in place.

    The hint is concatenated onto the first text part so the model sees it.
    If the envelope has no text part, a new one is inserted at the front.
    Image parts are never modified. A string ``text_summary`` gets the same
    suffix so string-fallback callers see the hint too. Values that are not
    multimodal envelopes are left untouched.
    """
    if not _is_multimodal_tool_result(value):
        return

    content_parts = value.get("content") or []
    first_text_part = next(
        (
            part
            for part in content_parts
            if isinstance(part, dict) and part.get("type") == "text"
        ),
        None,
    )
    if first_text_part is None:
        content_parts.insert(0, {"type": "text", "text": hint})
    else:
        first_text_part["text"] = str(first_text_part.get("text", "")) + hint
    # Re-assign: an empty ``content`` list was replaced by a fresh list above.
    value["content"] = content_parts

    summary = value.get("text_summary")
    if isinstance(summary, str):
        value["text_summary"] = summary + hint
def _extract_error_preview(result: Any, max_len: int = 180) -> str:
    """Build a single-line error summary of a tool result for footer display.

    The result is first reduced to text via ``_multimodal_text_summary``
    (``None`` becomes the empty string). If that text is a JSON object with a
    string ``error`` field, that field is used; otherwise the raw text wins.
    Whitespace is collapsed to single spaces and the result is cut to at most
    ``max_len`` characters, ending in ``…`` when truncated.
    """
    summary = "" if result is None else _multimodal_text_summary(result)
    if not isinstance(summary, str):
        try:
            summary = str(summary)
        except Exception:
            return ""

    # Tool handlers return ``{"success": false, "error": "..."}``; prefer the
    # ``error`` field when the text parses as such an object.
    candidate = summary.strip()
    if candidate.startswith("{"):
        try:
            payload = json.loads(candidate)
        except Exception:
            payload = None
        if isinstance(payload, dict):
            message = payload.get("error")
            if isinstance(message, str):
                summary = message

    one_line = " ".join(summary.split())
    if len(one_line) <= max_len:
        return one_line
    return one_line[: max_len - 1] + "…"
# Explicit export list. Every name below is underscore-prefixed, and
# ``from <module> import *`` skips such names unless they appear in
# ``__all__`` — listing them is what makes them reachable via a star-import.
# NOTE(review): presumably another module re-exports these helpers with a
# star-import; confirm against the importer before removing or renaming
# an entry.
__all__ = [
    "_NEVER_PARALLEL_TOOLS",
    "_PARALLEL_SAFE_TOOLS",
    "_PATH_SCOPED_TOOLS",
    "_DESTRUCTIVE_PATTERNS",
    "_REDIRECT_OVERWRITE",
    "_is_destructive_command",
    "_should_parallelize_tool_batch",
    "_extract_parallel_scope_path",
    "_paths_overlap",
    "_is_multimodal_tool_result",
    "_multimodal_text_summary",
    "_append_subdir_hint_to_multimodal",
    "_extract_file_mutation_targets",
    "_extract_error_preview",
    "_trajectory_normalize_msg",
]
def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
    """Execute multiple tool calls concurrently using a thread pool.

    Results are collected in the original tool-call order and appended to
    ``messages`` so the API sees them in the expected sequence.

    Phases:
      1. If an interrupt is already pending, append a "cancelled" tool message
         for every call and return.
      2. Parse each call's JSON arguments, reset nudge counters, take
         checkpoints, and ask the plugin hook / tool guardrails whether the
         call is blocked.
      3. Emit log lines and start callbacks for non-blocked calls.
      4. Run non-blocked calls on a ``ThreadPoolExecutor`` (at most
         ``_MAX_TOOL_WORKERS`` threads), polling every 5s to send heartbeats
         and to react to user interrupts.
      5. For each call, in order: post-process the result, fire completion
         callbacks, and append one ``role="tool"`` message.
      6. Enforce the per-turn result budget and inject any pending steer text.

    Args:
        agent: The parent ``AIAgent`` whose state, callbacks and helpers are used.
        assistant_message: Assistant message whose ``tool_calls`` are executed.
        messages: Conversation list; tool result messages are appended in place.
        effective_task_id: Task id forwarded to hooks, tools and environments.
        api_call_count: Accepted for signature compatibility; unused here.
    """
    tool_calls = assistant_message.tool_calls
    num_tools = len(tool_calls)

    # ── Pre-flight: interrupt check ──────────────────────────────────
    if agent._interrupt_requested:
        print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
        for tc in tool_calls:
            messages.append({
                "role": "tool",
                "name": tc.function.name,
                "content": f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
                "tool_call_id": tc.id,
            })
        return

    # ── Parse args + pre-execution bookkeeping ───────────────────────
    # list of (tool_call, function_name, function_args, block_result, blocked_by_guardrail)
    parsed_calls = []
    for tool_call in tool_calls:
        function_name = tool_call.function.name

        # Reset nudge counters
        if function_name == "memory":
            agent._turns_since_memory = 0
        elif function_name == "skill_manage":
            agent._iters_since_skill = 0

        try:
            function_args = json.loads(tool_call.function.arguments)
        except (json.JSONDecodeError, TypeError):
            # TypeError: ``arguments`` was None / not a string. Treat it like
            # malformed JSON instead of aborting the whole batch.
            function_args = {}
        if not isinstance(function_args, dict):
            function_args = {}

        # Checkpoint for file-mutating tools
        if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
            try:
                file_path = function_args.get("path", "")
                if file_path:
                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
                    agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
            except Exception:
                pass  # checkpointing is best-effort; never block tool execution

        # Checkpoint before destructive terminal commands
        if function_name == "terminal" and agent._checkpoint_mgr.enabled:
            try:
                cmd = function_args.get("command", "")
                if _is_destructive_command(cmd):
                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
                    agent._checkpoint_mgr.ensure_checkpoint(
                        cwd, f"before terminal: {cmd[:60]}"
                    )
            except Exception:
                pass  # checkpointing is best-effort; never block tool execution

        block_result = None
        blocked_by_guardrail = False
        try:
            from hermes_cli.plugins import get_pre_tool_call_block_message
            block_message = get_pre_tool_call_block_message(
                function_name, function_args, task_id=effective_task_id or "",
            )
        except Exception:
            block_message = None

        if block_message is not None:
            block_result = json.dumps({"error": block_message}, ensure_ascii=False)
        else:
            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
            if not guardrail_decision.allows_execution:
                block_result = agent._guardrail_block_result(guardrail_decision)
                blocked_by_guardrail = True

        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))

    # ── Logging / callbacks ──────────────────────────────────────────
    tool_names_str = ", ".join(name for _, name, _, _, _ in parsed_calls)
    if not agent.quiet_mode:
        print(f" ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
        for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
            args_str = json.dumps(args, ensure_ascii=False)
            if agent.verbose_logging:
                print(f" 📞 Tool {i}: {name}({list(args.keys())})")
                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f" 📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")

    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
        if block_result is not None:
            continue
        if agent.tool_progress_callback:
            try:
                preview = _build_tool_preview(name, args)
                agent.tool_progress_callback("tool.started", name, preview, args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
        if block_result is not None:
            continue
        if agent.tool_start_callback:
            try:
                agent.tool_start_callback(tc.id, name, args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

    # ── Concurrent execution ─────────────────────────────────────────
    # Each slot holds (function_name, function_args, function_result, duration, error_flag, blocked_flag)
    results = [None] * num_tools
    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
        if block_result is not None:
            results[i] = (name, args, block_result, 0.0, True, True)

    # Touch activity before launching workers so the gateway knows
    # we're executing tools (not stuck).
    agent._current_tool = tool_names_str
    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")

    # Capture CLI callbacks from the agent thread so worker threads can
    # register them locally. Without this, _get_approval_callback() in
    # terminal_tool returns None in ThreadPoolExecutor workers, causing
    # the dangerous-command prompt to fall back to input() — which
    # deadlocks against prompt_toolkit's raw terminal mode (#13617).
    _parent_approval_cb = _get_approval_callback()
    _parent_sudo_cb = _get_sudo_password_callback()

    def _run_tool(index, tool_call, function_name, function_args):
        """Worker function executed in a thread; stores its outcome in ``results[index]``."""
        # Register this worker tid so the agent can fan out an interrupt
        # to it — see AIAgent.interrupt(). Must happen first thing, and
        # is paired with discard + clear in the finally block below.
        _worker_tid = threading.current_thread().ident
        with agent._tool_worker_threads_lock:
            agent._tool_worker_threads.add(_worker_tid)
        try:
            # Race: if the agent was interrupted between fan-out (which
            # snapshotted an empty/earlier set) and our registration, apply
            # the interrupt to our own tid now so is_interrupted() inside
            # the tool returns True on the next poll.
            if agent._interrupt_requested:
                try:
                    _ra()._set_interrupt(True, _worker_tid)
                except Exception:
                    pass
            # Set the activity callback on THIS worker thread so
            # _wait_for_process (terminal commands) can fire heartbeats.
            # The callback is thread-local; the main thread's callback
            # is invisible to worker threads.
            try:
                from tools.environments.base import set_activity_callback
                set_activity_callback(agent._touch_activity)
            except Exception:
                pass
            # Propagate approval/sudo callbacks to this worker thread.
            # Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr).
            if _parent_approval_cb is not None:
                try:
                    _set_approval_callback(_parent_approval_cb)
                except Exception:
                    pass
            if _parent_sudo_cb is not None:
                try:
                    _set_sudo_password_callback(_parent_sudo_cb)
                except Exception:
                    pass
            start = time.time()
            try:
                result = agent._invoke_tool(
                    function_name,
                    function_args,
                    effective_task_id,
                    tool_call.id,
                    messages=messages,
                    pre_tool_block_checked=True,
                )
            except Exception as tool_error:
                result = f"Error executing tool '{function_name}': {tool_error}"
                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
            duration = time.time() - start
            is_error, _ = _detect_tool_failure(function_name, result)
            # Store the outcome before logging so a logging problem can
            # never cost us the result.
            results[index] = (function_name, function_args, result, duration, is_error, False)
            # Multimodal results are dicts and cannot be sliced; log their
            # text view. For plain strings this is the string itself.
            result_text = _multimodal_text_summary(result)
            if is_error:
                logger.info("tool %s failed (%.2fs): %s", function_name, duration, result_text[:200])
            else:
                logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result_text))
        finally:
            # Tear down worker-tid tracking. Clear any interrupt bit we may
            # have set so the next task scheduled onto this recycled tid
            # starts with a clean slate. In ``finally`` so an unexpected
            # error above cannot leak the tid or the callbacks.
            with agent._tool_worker_threads_lock:
                agent._tool_worker_threads.discard(_worker_tid)
            try:
                _ra()._set_interrupt(False, _worker_tid)
            except Exception:
                pass
            # Clear thread-local callbacks so a recycled worker thread
            # doesn't hold stale references to a disposed CLI instance.
            try:
                _set_approval_callback(None)
                _set_sudo_password_callback(None)
            except Exception:
                pass

    # Start spinner for CLI mode (skip when TUI handles tool progress)
    spinner = None
    if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
        face = random.choice(KawaiiSpinner.get_waiting_faces())
        spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots', print_fn=agent._print_fn)
        spinner.start()

    try:
        runnable_calls = [
            (i, tc, name, args)
            for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls)
            if block_result is None
        ]
        futures = []
        # Future -> tool name. ``futures`` is indexed by *runnable* call, not
        # by position in ``parsed_calls``, so names are tracked explicitly;
        # the two indexings diverge as soon as one call is blocked.
        _future_names = {}
        if runnable_calls:
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for i, tc, name, args in runnable_calls:
                    # Propagate ContextVars (e.g. _approval_session_key); mirrors asyncio.to_thread.
                    ctx = contextvars.copy_context()
                    f = executor.submit(ctx.run, _run_tool, i, tc, name, args)
                    futures.append(f)
                    _future_names[f] = name

                # Wait for all to complete with periodic heartbeats so the
                # gateway's inactivity monitor doesn't kill us during long
                # concurrent tool batches. Also check for user interrupts
                # so we don't block indefinitely when the user sends /stop
                # or a new message during concurrent tool execution.
                _conc_start = time.time()
                _interrupt_logged = False
                while True:
                    done, not_done = concurrent.futures.wait(
                        futures, timeout=5.0,
                    )
                    if not not_done:
                        break

                    # Check for interrupt — the per-thread interrupt signal
                    # already causes individual tools (terminal, execute_code)
                    # to abort, but tools without interrupt checks (web_search,
                    # read_file) will run to completion. Cancel any futures
                    # that haven't started yet so we don't block on them.
                    if agent._interrupt_requested:
                        if not _interrupt_logged:
                            _interrupt_logged = True
                            agent._vprint(
                                f"{agent.log_prefix}⚡ Interrupt: cancelling "
                                f"{len(not_done)} pending concurrent tool(s)",
                                force=True,
                            )
                        for f in not_done:
                            f.cancel()
                        # Give already-running tools a moment to notice the
                        # per-thread interrupt signal and exit gracefully.
                        concurrent.futures.wait(not_done, timeout=3.0)
                        break

                    _conc_elapsed = int(time.time() - _conc_start)
                    # Heartbeat every ~30s (6 × 5s poll intervals)
                    if _conc_elapsed > 0 and _conc_elapsed % 30 < 6:
                        _still_running = [_future_names[f] for f in not_done]
                        agent._touch_activity(
                            f"concurrent tools running ({_conc_elapsed}s, "
                            f"{len(not_done)} remaining: {', '.join(_still_running[:3])})"
                        )
    finally:
        if spinner:
            # Build a summary message for the spinner stop
            completed = sum(1 for r in results if r is not None)
            total_dur = sum(r[3] for r in results if r is not None)
            spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")

    # ── Post-execution: display per-tool results ─────────────────────
    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
        r = results[i]
        blocked = False
        if r is None:
            # Tool was cancelled (interrupt) or thread didn't return
            if agent._interrupt_requested:
                function_result = f"[Tool execution cancelled — {name} was skipped due to user interrupt]"
            else:
                function_result = f"Error executing tool '{name}': thread did not return a result"
            tool_duration = 0.0
        else:
            function_name, function_args, function_result, tool_duration, is_error, blocked = r

            if not blocked:
                function_result = agent._append_guardrail_observation(
                    function_name,
                    function_args,
                    function_result,
                    failed=is_error,
                )

            if is_error:
                _err_text = _multimodal_text_summary(function_result)
                result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text
                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)

            # Track file-mutation outcome for the turn-end verifier.
            # `blocked` calls never actually ran — don't let a guardrail
            # block count as either a failure or a success.
            if not blocked:
                try:
                    agent._record_file_mutation_result(
                        function_name, function_args, function_result, is_error,
                    )
                except Exception as _ver_err:
                    logging.debug("file-mutation verifier record failed: %s", _ver_err)

            if not blocked and agent.tool_progress_callback:
                try:
                    agent.tool_progress_callback(
                        "tool.completed", function_name, None, None,
                        duration=tool_duration, is_error=is_error,
                    )
                except Exception as cb_err:
                    logging.debug(f"Tool progress callback error: {cb_err}")

            if agent.verbose_logging:
                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

        # Print cute message per tool
        if agent._should_emit_quiet_tool_messages():
            cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
            agent._safe_print(f" {cute_msg}")
        elif not agent.quiet_mode:
            _preview_str = _multimodal_text_summary(function_result)
            if agent.verbose_logging:
                print(f" ✅ Tool {i+1} completed in {tool_duration:.2f}s")
                print(agent._wrap_verbose("Result: ", _preview_str))
            else:
                response_preview = _preview_str[:agent.log_prefix_chars] + "..." if len(_preview_str) > agent.log_prefix_chars else _preview_str
                print(f" ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")

        agent._current_tool = None
        agent._touch_activity(f"tool completed: {name} ({tool_duration:.1f}s)")

        if not blocked and agent.tool_complete_callback:
            try:
                agent.tool_complete_callback(tc.id, name, args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

        # Only plain (non-multimodal) results go through result persistence.
        if not _is_multimodal_tool_result(function_result):
            function_result = maybe_persist_tool_result(
                content=function_result,
                tool_name=name,
                tool_use_id=tc.id,
                env=get_active_env(effective_task_id),
            )

        subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
        if subdir_hints:
            if _is_multimodal_tool_result(function_result):
                # Append the hint to the text summary part so the model
                # still sees it; don't touch the image blocks.
                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
            else:
                function_result += subdir_hints

        # Unwrap _multimodal dicts to an OpenAI-style content list so any
        # vision-capable provider receives [{type:text},{type:image_url}]
        # rather than a raw Python dict. The Anthropic adapter already
        # accepts content lists; vision-capable OpenAI-compatible servers
        # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
        # Text-only servers get a string-safe fallback here so a rejected
        # image tool result never poisons canonical session history.
        # String results pass through unchanged.
        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
        tool_msg = {
            "role": "tool",
            "name": name,
            "content": _tool_content,
            "tool_call_id": tc.id,
        }
        messages.append(tool_msg)

        # ── Per-tool /steer drain ───────────────────────────────────
        # Same as the sequential path: drain between each collected
        # result so the steer lands as early as possible.
        agent._apply_pending_steer_to_tool_results(messages, 1)

    # ── Per-turn aggregate budget enforcement ─────────────────────────
    num_tools = len(parsed_calls)
    if num_tools > 0:
        turn_tool_msgs = messages[-num_tools:]
        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))

    # ── /steer injection ──────────────────────────────────────────────
    # Append any pending user steer text to the last tool result so the
    # agent sees it on its next iteration. Runs AFTER budget enforcement
    # so the steer marker is never truncated. See steer() for details.
    if num_tools > 0:
        agent._apply_pending_steer_to_tool_results(messages, num_tools)
+ if agent._interrupt_requested: + remaining_calls = assistant_message.tool_calls[i-1:] + if remaining_calls: + agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True) + for skipped_tc in remaining_calls: + skipped_name = skipped_tc.function.name + skip_msg = { + "role": "tool", + "name": skipped_name, + "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]", + "tool_call_id": skipped_tc.id, + } + messages.append(skip_msg) + break + + function_name = tool_call.function.name + + try: + function_args = json.loads(tool_call.function.arguments) + except json.JSONDecodeError as e: + logging.warning(f"Unexpected JSON error after validation: {e}") + function_args = {} + if not isinstance(function_args, dict): + function_args = {} + + # Check plugin hooks for a block directive before executing. + _block_msg: Optional[str] = None + try: + from hermes_cli.plugins import get_pre_tool_call_block_message + _block_msg = get_pre_tool_call_block_message( + function_name, function_args, task_id=effective_task_id or "", + ) + except Exception: + pass + + _guardrail_block_decision: ToolGuardrailDecision | None = None + if _block_msg is None: + guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args) + if not guardrail_decision.allows_execution: + _guardrail_block_decision = guardrail_decision + + _execution_blocked = _block_msg is not None or _guardrail_block_decision is not None + + if _execution_blocked: + # Tool blocked by plugin or guardrail policy — skip counters, + # callbacks, checkpointing, activity mutation, and real execution. 
+ pass + # Reset nudge counters when the relevant tool is actually used + elif function_name == "memory": + agent._turns_since_memory = 0 + elif function_name == "skill_manage": + agent._iters_since_skill = 0 + + if not agent.quiet_mode: + args_str = json.dumps(function_args, ensure_ascii=False) + if agent.verbose_logging: + print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())})") + print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False))) + else: + args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str + print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}") + + if not _execution_blocked: + agent._current_tool = function_name + agent._touch_activity(f"executing tool: {function_name}") + + # Set activity callback for long-running tool execution (terminal + # commands, etc.) so the gateway's inactivity monitor doesn't kill + # the agent while a command is running. 
+ if not _execution_blocked: + try: + from tools.environments.base import set_activity_callback + set_activity_callback(agent._touch_activity) + except Exception: + pass + + if not _execution_blocked and agent.tool_progress_callback: + try: + preview = _build_tool_preview(function_name, function_args) + agent.tool_progress_callback("tool.started", function_name, preview, function_args) + except Exception as cb_err: + logging.debug(f"Tool progress callback error: {cb_err}") + + if not _execution_blocked and agent.tool_start_callback: + try: + agent.tool_start_callback(tool_call.id, function_name, function_args) + except Exception as cb_err: + logging.debug(f"Tool start callback error: {cb_err}") + + # Checkpoint: snapshot working dir before file-mutating tools + if not _execution_blocked and function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled: + try: + file_path = function_args.get("path", "") + if file_path: + work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path) + agent._checkpoint_mgr.ensure_checkpoint( + work_dir, f"before {function_name}" + ) + except Exception: + pass # never block tool execution + + # Checkpoint before destructive terminal commands + if not _execution_blocked and function_name == "terminal" and agent._checkpoint_mgr.enabled: + try: + cmd = function_args.get("command", "") + if _is_destructive_command(cmd): + cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd()) + agent._checkpoint_mgr.ensure_checkpoint( + cwd, f"before terminal: {cmd[:60]}" + ) + except Exception: + pass # never block tool execution + + tool_start_time = time.time() + + if _block_msg is not None: + # Tool blocked by plugin policy — return error without executing. 
+ function_result = json.dumps({"error": _block_msg}, ensure_ascii=False) + tool_duration = 0.0 + elif _guardrail_block_decision is not None: + # Tool blocked by tool-loop guardrail — synthesize exactly one + # tool result for the original tool_call_id without executing. + function_result = agent._guardrail_block_result(_guardrail_block_decision) + tool_duration = 0.0 + elif function_name == "todo": + from tools.todo_tool import todo_tool as _todo_tool + function_result = _todo_tool( + todos=function_args.get("todos"), + merge=function_args.get("merge", False), + store=agent._todo_store, + ) + tool_duration = time.time() - tool_start_time + if agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}") + elif function_name == "session_search": + session_db = agent._get_session_db_for_recall() + if not session_db: + from hermes_state import format_session_db_unavailable + function_result = json.dumps({"success": False, "error": format_session_db_unavailable()}) + else: + from tools.session_search_tool import session_search as _session_search + function_result = _session_search( + query=function_args.get("query", ""), + role_filter=function_args.get("role_filter"), + limit=function_args.get("limit", 3), + db=session_db, + current_session_id=agent.session_id, + ) + tool_duration = time.time() - tool_start_time + if agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}") + elif function_name == "memory": + target = function_args.get("target", "memory") + from tools.memory_tool import memory_tool as _memory_tool + function_result = _memory_tool( + action=function_args.get("action"), + target=target, + content=function_args.get("content"), + old_text=function_args.get("old_text"), + store=agent._memory_store, + ) + # Bridge: notify external memory provider of built-in 
memory writes + if agent._memory_manager and function_args.get("action") in {"add", "replace"}: + try: + agent._memory_manager.on_memory_write( + function_args.get("action", ""), + target, + function_args.get("content", ""), + metadata=agent._build_memory_write_metadata( + task_id=effective_task_id, + tool_call_id=getattr(tool_call, "id", None), + ), + ) + except Exception: + pass + tool_duration = time.time() - tool_start_time + if agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}") + elif function_name == "clarify": + from tools.clarify_tool import clarify_tool as _clarify_tool + function_result = _clarify_tool( + question=function_args.get("question", ""), + choices=function_args.get("choices"), + callback=agent.clarify_callback, + ) + tool_duration = time.time() - tool_start_time + if agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") + elif function_name == "delegate_task": + tasks_arg = function_args.get("tasks") + if tasks_arg and isinstance(tasks_arg, list): + spinner_label = f"🔀 delegating {len(tasks_arg)} tasks" + else: + goal_preview = (function_args.get("goal") or "")[:30] + spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating" + spinner = None + if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner(): + face = random.choice(KawaiiSpinner.get_waiting_faces()) + spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots', print_fn=agent._print_fn) + spinner.start() + agent._delegate_spinner = spinner + _delegate_result = None + try: + function_result = agent._dispatch_delegate_task(function_args) + _delegate_result = function_result + finally: + agent._delegate_spinner = None + tool_duration = time.time() - tool_start_time + cute_msg = _get_cute_tool_message_impl('delegate_task', 
function_args, tool_duration, result=_delegate_result) + if spinner: + spinner.stop(cute_msg) + elif agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {cute_msg}") + elif agent._context_engine_tool_names and function_name in agent._context_engine_tool_names: + # Context engine tools (lcm_grep, lcm_describe, lcm_expand, etc.) + spinner = None + if agent._should_emit_quiet_tool_messages(): + face = random.choice(KawaiiSpinner.get_waiting_faces()) + emoji = _get_tool_emoji(function_name) + preview = _build_tool_preview(function_name, function_args) or function_name + spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn) + spinner.start() + _ce_result = None + try: + function_result = agent.context_compressor.handle_tool_call(function_name, function_args, messages=messages) + _ce_result = function_result + except Exception as tool_error: + function_result = json.dumps({"error": f"Context engine tool '{function_name}' failed: {tool_error}"}) + logger.error("context_engine.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True) + finally: + tool_duration = time.time() - tool_start_time + cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_ce_result) + if spinner: + spinner.stop(cute_msg) + elif agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {cute_msg}") + elif agent._memory_manager and agent._memory_manager.has_tool(function_name): + # Memory provider tools (hindsight_retain, honcho_search, etc.) + # These are not in the tool registry — route through MemoryManager. 
+ spinner = None + if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner(): + face = random.choice(KawaiiSpinner.get_waiting_faces()) + emoji = _get_tool_emoji(function_name) + preview = _build_tool_preview(function_name, function_args) or function_name + spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn) + spinner.start() + _mem_result = None + try: + function_result = agent._memory_manager.handle_tool_call(function_name, function_args) + _mem_result = function_result + except Exception as tool_error: + function_result = json.dumps({"error": f"Memory tool '{function_name}' failed: {tool_error}"}) + logger.error("memory_manager.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True) + finally: + tool_duration = time.time() - tool_start_time + cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_mem_result) + if spinner: + spinner.stop(cute_msg) + elif agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {cute_msg}") + elif agent.quiet_mode: + spinner = None + if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner(): + face = random.choice(KawaiiSpinner.get_waiting_faces()) + emoji = _get_tool_emoji(function_name) + preview = _build_tool_preview(function_name, function_args) or function_name + spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn) + spinner.start() + _spinner_result = None + try: + function_result = _ra().handle_function_call( + function_name, function_args, effective_task_id, + tool_call_id=tool_call.id, + session_id=agent.session_id or "", + enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None, + skip_pre_tool_call_hook=True, + ) + _spinner_result = function_result + except Exception as tool_error: + function_result = f"Error executing tool '{function_name}': {tool_error}" + 
logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True) + finally: + tool_duration = time.time() - tool_start_time + cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result) + if spinner: + spinner.stop(cute_msg) + elif agent._should_emit_quiet_tool_messages(): + agent._vprint(f" {cute_msg}") + else: + try: + function_result = _ra().handle_function_call( + function_name, function_args, effective_task_id, + tool_call_id=tool_call.id, + session_id=agent.session_id or "", + enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None, + skip_pre_tool_call_hook=True, + ) + except Exception as tool_error: + function_result = f"Error executing tool '{function_name}': {tool_error}" + logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True) + tool_duration = time.time() - tool_start_time + + if isinstance(function_result, str): + result_preview = function_result if agent.verbose_logging else ( + function_result[:200] if len(function_result) > 200 else function_result + ) + _result_len = len(function_result) + else: + # Multimodal dict result (_multimodal=True) — not sliceable as string + result_preview = function_result + _result_len = len(str(function_result)) + + # Log tool errors to the persistent error log so [error] tags + # in the UI always have a corresponding detailed entry on disk. 
+ _is_error_result, _ = _detect_tool_failure(function_name, function_result) + if not _execution_blocked: + function_result = agent._append_guardrail_observation( + function_name, + function_args, + function_result, + failed=_is_error_result, + ) + result_preview = function_result if agent.verbose_logging else ( + function_result[:200] if len(function_result) > 200 else function_result + ) + if _is_error_result: + logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview) + else: + logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len) + + # Track file-mutation outcome for the turn-end verifier. See + # the concurrent path for the rationale; both paths must feed + # the same state so the footer reflects every tool call in the + # turn, not just the parallel ones. + if not _execution_blocked: + try: + agent._record_file_mutation_result( + function_name, function_args, function_result, _is_error_result, + ) + except Exception as _ver_err: + logging.debug("file-mutation verifier record failed: %s", _ver_err) + + if not _execution_blocked and agent.tool_progress_callback: + try: + agent.tool_progress_callback( + "tool.completed", function_name, None, None, + duration=tool_duration, is_error=_is_error_result, + ) + except Exception as cb_err: + logging.debug(f"Tool progress callback error: {cb_err}") + + agent._current_tool = None + agent._touch_activity(f"tool completed: {function_name} ({tool_duration:.1f}s)") + + if agent.verbose_logging: + logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s") + _log_result = _multimodal_text_summary(function_result) + logging.debug(f"Tool result ({len(_log_result)} chars): {_log_result}") + + if not _execution_blocked and agent.tool_complete_callback: + try: + agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result) + except Exception as cb_err: + logging.debug(f"Tool complete callback error: {cb_err}") 
+ + function_result = maybe_persist_tool_result( + content=function_result, + tool_name=function_name, + tool_use_id=tool_call.id, + env=get_active_env(effective_task_id), + ) if not _is_multimodal_tool_result(function_result) else function_result + + # Discover subdirectory context files from tool arguments + subdir_hints = agent._subdirectory_hints.check_tool_call(function_name, function_args) + if subdir_hints: + if _is_multimodal_tool_result(function_result): + _append_subdir_hint_to_multimodal(function_result, subdir_hints) + else: + function_result += subdir_hints + + # Unwrap _multimodal dicts to an OpenAI-style content list + # (see parallel path for rationale). String results pass through. + _tool_content = agent._tool_result_content_for_active_model(function_name, function_result) + tool_msg = { + "role": "tool", + "name": function_name, + "content": _tool_content, + "tool_call_id": tool_call.id + } + messages.append(tool_msg) + + # ── Per-tool /steer drain ─────────────────────────────────── + # Drain pending steer BETWEEN individual tool calls so the + # injection lands as soon as a tool finishes — not after the + # entire batch. The model sees it on the next API iteration. + agent._apply_pending_steer_to_tool_results(messages, 1) + + if not agent.quiet_mode: + if agent.verbose_logging: + print(f" ✅ Tool {i} completed in {tool_duration:.2f}s") + print(agent._wrap_verbose("Result: ", function_result)) + else: + _fr_str = function_result if isinstance(function_result, str) else str(function_result) + response_preview = _fr_str[:agent.log_prefix_chars] + "..." 
if len(_fr_str) > agent.log_prefix_chars else _fr_str + print(f" ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}") + + if agent._interrupt_requested and i < len(assistant_message.tool_calls): + remaining = len(assistant_message.tool_calls) - i + agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True) + for skipped_tc in assistant_message.tool_calls[i:]: + skipped_name = skipped_tc.function.name + skip_msg = { + "role": "tool", + "name": skipped_name, + "content": f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]", + "tool_call_id": skipped_tc.id + } + messages.append(skip_msg) + break + + if agent.tool_delay > 0 and i < len(assistant_message.tool_calls): + time.sleep(agent.tool_delay) + + # ── Per-turn aggregate budget enforcement ───────────────────────── + num_tools_seq = len(assistant_message.tool_calls) + if num_tools_seq > 0: + enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id)) + + # ── /steer injection ────────────────────────────────────────────── + # See _execute_tool_calls_parallel for the rationale. Same hook, + # applied to sequential execution as well. + if num_tools_seq > 0: + agent._apply_pending_steer_to_tool_results(messages, num_tools_seq) + + + + +__all__ = [ + "execute_tool_calls_concurrent", + "execute_tool_calls_sequential", +] diff --git a/run_agent.py b/run_agent.py index 5e0a9ec06ac..8471afccddf 100644 --- a/run_agent.py +++ b/run_agent.py @@ -70,38 +70,20 @@ from pathlib import Path from hermes_constants import get_hermes_home - -_OPENAI_CLS_CACHE: Optional[type] = None +# OpenAI lazy proxy + safe stdio + proxy URL helpers — see agent/process_bootstrap.py. +# `OpenAI` is re-exported here so `patch("run_agent.OpenAI", ...)` in tests works. 
+from agent.process_bootstrap import ( + OpenAI, + _OpenAIProxy, + _load_openai_cls, + _SafeWriter, + _install_safe_stdio, + _get_proxy_from_env, + _get_proxy_for_base_url, +) +from agent.iteration_budget import IterationBudget -def _load_openai_cls() -> type: - """Import and cache ``openai.OpenAI``.""" - global _OPENAI_CLS_CACHE - if _OPENAI_CLS_CACHE is None: - from openai import OpenAI as _cls - _OPENAI_CLS_CACHE = _cls - return _OPENAI_CLS_CACHE - - -class _OpenAIProxy: - """Module-level proxy that looks like ``openai.OpenAI`` but imports lazily.""" - - __slots__ = () - - def __call__(self, *args, **kwargs): - return _load_openai_cls()(*args, **kwargs) - - def __instancecheck__(self, obj): - return isinstance(obj, _load_openai_cls()) - - def __repr__(self): - return "" - - -OpenAI = _OpenAIProxy() - -# Load .env from ~/.hermes/.env first, then project root as dev fallback. -# User-managed env files should override stale shell exports on restart. from hermes_cli.env_loader import load_hermes_dotenv from hermes_cli.timeouts import ( get_provider_request_timeout, @@ -189,173 +171,41 @@ from agent.trajectory import ( convert_scratchpad_to_think, has_incomplete_scratchpad, save_trajectory as _save_trajectory_to_file, ) +from agent.message_sanitization import ( + _SURROGATE_RE, + _sanitize_surrogates, + _sanitize_structure_surrogates, + _sanitize_messages_surrogates, + _escape_invalid_chars_in_json_strings, + _repair_tool_call_arguments, + _strip_non_ascii, + _sanitize_messages_non_ascii, + _sanitize_tools_non_ascii, + _strip_images_from_messages, + _sanitize_structure_non_ascii, +) +from agent.tool_dispatch_helpers import ( + _NEVER_PARALLEL_TOOLS, + _PARALLEL_SAFE_TOOLS, + _PATH_SCOPED_TOOLS, + _DESTRUCTIVE_PATTERNS, + _REDIRECT_OVERWRITE, + _is_destructive_command, + _should_parallelize_tool_batch, + _extract_parallel_scope_path, + _paths_overlap, + _is_multimodal_tool_result, + _multimodal_text_summary, + _append_subdir_hint_to_multimodal, + 
_extract_file_mutation_targets, + _extract_error_preview, + _trajectory_normalize_msg, +) from utils import atomic_json_write, base_url_host_matches, base_url_hostname, env_var_enabled, normalize_proxy_url from hermes_cli.config import cfg_get -class _SafeWriter: - """Transparent stdio wrapper that catches OSError/ValueError from broken pipes. - - When hermes-agent runs as a systemd service, Docker container, or headless - daemon, the stdout/stderr pipe can become unavailable (idle timeout, buffer - exhaustion, socket reset). Any print() call then raises - ``OSError: [Errno 5] Input/output error``, which can crash agent setup or - run_conversation() — especially via double-fault when an except handler - also tries to print. - - Additionally, when subagents run in ThreadPoolExecutor threads, the shared - stdout handle can close between thread teardown and cleanup, raising - ``ValueError: I/O operation on closed file`` instead of OSError. - - This wrapper delegates all writes to the underlying stream and silently - catches both OSError and ValueError. It is transparent when the wrapped - stream is healthy. - """ - - __slots__ = ("_inner",) - - def __init__(self, inner): - object.__setattr__(self, "_inner", inner) - - def write(self, data): - try: - return self._inner.write(data) - except (OSError, ValueError): - return len(data) if isinstance(data, str) else 0 - - def flush(self): - try: - self._inner.flush() - except (OSError, ValueError): - pass - - def fileno(self): - return self._inner.fileno() - - def isatty(self): - try: - return self._inner.isatty() - except (OSError, ValueError): - return False - - def __getattr__(self, name): - return getattr(self._inner, name) - - -def _get_proxy_from_env() -> Optional[str]: - """Read proxy URL from environment variables. - - Checks HTTPS_PROXY, HTTP_PROXY, ALL_PROXY (and lowercase variants) in order. - Returns the first valid proxy URL found, or None if no proxy is configured. 
- """ - for key in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY", - "https_proxy", "http_proxy", "all_proxy"): - value = os.environ.get(key, "").strip() - if value: - return normalize_proxy_url(value) - return None - - -def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]: - """Return an env-configured proxy unless NO_PROXY excludes this base URL.""" - proxy = _get_proxy_from_env() - if not proxy or not base_url: - return proxy - - host = base_url_hostname(base_url) - if not host: - return proxy - - try: - if urllib.request.proxy_bypass_environment(host): - return None - except Exception: - pass - - return proxy - - -def _install_safe_stdio() -> None: - """Wrap stdout/stderr so best-effort console output cannot crash the agent.""" - for stream_name in ("stdout", "stderr"): - stream = getattr(sys, stream_name, None) - if stream is not None and not isinstance(stream, _SafeWriter): - setattr(sys, stream_name, _SafeWriter(stream)) - - -class IterationBudget: - """Thread-safe iteration counter for an agent. - - Each agent (parent or subagent) gets its own ``IterationBudget``. - The parent's budget is capped at ``max_iterations`` (default 90). - Each subagent gets an independent budget capped at - ``delegation.max_iterations`` (default 50) — this means total - iterations across parent + subagents can exceed the parent's cap. - Users control the per-subagent limit via ``delegation.max_iterations`` - in config.yaml. - - ``execute_code`` (programmatic tool calling) iterations are refunded via - :meth:`refund` so they don't eat into the budget. - """ - - def __init__(self, max_total: int): - self.max_total = max_total - self._used = 0 - self._lock = threading.Lock() - - def consume(self) -> bool: - """Try to consume one iteration. Returns True if allowed.""" - with self._lock: - if self._used >= self.max_total: - return False - self._used += 1 - return True - - def refund(self) -> None: - """Give back one iteration (e.g. 
for execute_code turns).""" - with self._lock: - if self._used > 0: - self._used -= 1 - - @property - def used(self) -> int: - with self._lock: - return self._used - - @property - def remaining(self) -> int: - with self._lock: - return max(0, self.max_total - self._used) - - -# Tools that must never run concurrently (interactive / user-facing). -# When any of these appear in a batch, we fall back to sequential execution. -_NEVER_PARALLEL_TOOLS = frozenset({"clarify"}) - -# Read-only tools with no shared mutable session state. -_PARALLEL_SAFE_TOOLS = frozenset({ - "ha_get_state", - "ha_list_entities", - "ha_list_services", - "read_file", - "search_files", - "session_search", - "skill_view", - "skills_list", - "vision_analyze", - "web_extract", - "web_search", -}) - -# File tools can run concurrently when they target independent paths. -_PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"}) - -# Tools that mutate files on disk. Used by the per-turn verifier that -# surfaces silently-failed file edits so the model can't over-claim success. -# Imported above as `_FILE_MUTATING_TOOLS` from `agent.tool_result_classification`. - -# Maximum number of concurrent worker threads for parallel tool execution. _MAX_TOOL_WORKERS = 8 # Guard so the OpenRouter metadata pre-warm thread is only spawned once per @@ -364,682 +214,6 @@ _MAX_TOOL_WORKERS = 8 # exhaust the system thread limit (RuntimeError: can't start new thread). _openrouter_prewarm_done = threading.Event() -# Patterns that indicate a terminal command may modify/delete files. 
-_DESTRUCTIVE_PATTERNS = re.compile( - r"""(?:^|\s|&&|\|\||;|`)(?: - rm\s|rmdir\s| - cp\s|install\s| - mv\s| - sed\s+-i| - truncate\s| - dd\s| - shred\s| - git\s+(?:reset|clean|checkout)\s - )""", - re.VERBOSE, -) -# Output redirects that overwrite files (> but not >>) -_REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]') - - -def _is_destructive_command(cmd: str) -> bool: - """Heuristic: does this terminal command look like it modifies/deletes files?""" - if not cmd: - return False - if _DESTRUCTIVE_PATTERNS.search(cmd): - return True - if _REDIRECT_OVERWRITE.search(cmd): - return True - return False - - -def _is_mcp_tool_parallel_safe(tool_name: str) -> bool: - """Check if an MCP tool comes from a server with parallel tool calls enabled. - - Lazy-imports from ``tools.mcp_tool`` to avoid circular dependencies. - Returns False if the MCP module is not available. - """ - try: - from tools.mcp_tool import is_mcp_tool_parallel_safe - return is_mcp_tool_parallel_safe(tool_name) - except Exception: - return False - - -def _should_parallelize_tool_batch(tool_calls) -> bool: - """Return True when a tool-call batch is safe to run concurrently.""" - if len(tool_calls) <= 1: - return False - - tool_names = [tc.function.name for tc in tool_calls] - if any(name in _NEVER_PARALLEL_TOOLS for name in tool_names): - return False - - reserved_paths: list[Path] = [] - for tool_call in tool_calls: - tool_name = tool_call.function.name - try: - function_args = json.loads(tool_call.function.arguments) - except Exception: - logging.debug( - "Could not parse args for %s — defaulting to sequential; raw=%s", - tool_name, - tool_call.function.arguments[:200], - ) - return False - if not isinstance(function_args, dict): - logging.debug( - "Non-dict args for %s (%s) — defaulting to sequential", - tool_name, - type(function_args).__name__, - ) - return False - - if tool_name in _PATH_SCOPED_TOOLS: - scoped_path = _extract_parallel_scope_path(tool_name, function_args) - if scoped_path is None: 
- return False - if any(_paths_overlap(scoped_path, existing) for existing in reserved_paths): - return False - reserved_paths.append(scoped_path) - continue - - if tool_name not in _PARALLEL_SAFE_TOOLS: - # Check if it's an MCP tool from a server that opted into parallel calls. - if not _is_mcp_tool_parallel_safe(tool_name): - return False - - return True - - -def _extract_parallel_scope_path(tool_name: str, function_args: dict) -> Path | None: - """Return the normalized file target for path-scoped tools.""" - if tool_name not in _PATH_SCOPED_TOOLS: - return None - - raw_path = function_args.get("path") - if not isinstance(raw_path, str) or not raw_path.strip(): - return None - - expanded = Path(raw_path).expanduser() - if expanded.is_absolute(): - return Path(os.path.abspath(str(expanded))) - - # Avoid resolve(); the file may not exist yet. - return Path(os.path.abspath(str(Path.cwd() / expanded))) - - -def _paths_overlap(left: Path, right: Path) -> bool: - """Return True when two paths may refer to the same subtree.""" - left_parts = left.parts - right_parts = right.parts - if not left_parts or not right_parts: - # Empty paths shouldn't reach here (guarded upstream), but be safe. - return bool(left_parts) == bool(right_parts) and bool(left_parts) - common_len = min(len(left_parts), len(right_parts)) - return left_parts[:common_len] == right_parts[:common_len] - - - -_SURROGATE_RE = re.compile(r'[\ud800-\udfff]') - - - - -def _is_multimodal_tool_result(value: Any) -> bool: - """True if the value is a multimodal tool result envelope. - - Multimodal handlers (e.g. tools/computer_use) return a dict with - `_multimodal=True`, a `content` key holding OpenAI-style content - parts, and an optional `text_summary` for string-only fallbacks. 
- """ - return ( - isinstance(value, dict) - and value.get("_multimodal") is True - and isinstance(value.get("content"), list) - ) - - -def _multimodal_text_summary(value: Any) -> str: - """Extract a plain text view of a multimodal tool result. - - Used wherever downstream code needs a string — logging, previews, - persistence size heuristics, fall-back content for providers that - don't support multipart tool messages. - """ - if _is_multimodal_tool_result(value): - if value.get("text_summary"): - return str(value["text_summary"]) - parts = [] - for p in value.get("content") or []: - if isinstance(p, dict) and p.get("type") == "text": - parts.append(str(p.get("text", ""))) - if parts: - return "\n".join(parts) - return "[multimodal tool result]" - if isinstance(value, str): - return value - try: - import json as _json - return _json.dumps(value, default=str) - except Exception: - return str(value) - - -def _append_subdir_hint_to_multimodal(value: Dict[str, Any], hint: str) -> None: - """Mutate a multimodal tool-result envelope to append a subdir hint. - - The hint is added to the first text part so the model sees it; image - parts are left untouched. `text_summary` is also updated for - string-fallback callers. - """ - if not _is_multimodal_tool_result(value): - return - parts = value.get("content") or [] - for p in parts: - if isinstance(p, dict) and p.get("type") == "text": - p["text"] = str(p.get("text", "")) + hint - break - else: - parts.insert(0, {"type": "text", "text": hint}) - value["content"] = parts - if isinstance(value.get("text_summary"), str): - value["text_summary"] = value["text_summary"] + hint - - -def _extract_file_mutation_targets(tool_name: str, args: Dict[str, Any]) -> List[str]: - """Return the file paths a ``write_file`` or ``patch`` call is targeting. - - For ``write_file`` and ``patch`` in replace mode this is just ``args["path"]``. 
- For ``patch`` in V4A patch mode we parse the patch content for - ``*** Update File:`` / ``*** Add File:`` / ``*** Delete File:`` headers so - the verifier can track each file in a multi-file patch separately. - """ - if tool_name not in _FILE_MUTATING_TOOLS: - return [] - if tool_name == "write_file": - p = args.get("path") - return [str(p)] if p else [] - # tool_name == "patch" - mode = args.get("mode") or "replace" - if mode == "replace": - p = args.get("path") - return [str(p)] if p else [] - if mode == "patch": - body = args.get("patch") or "" - if not isinstance(body, str) or not body: - return [] - import re as _re - paths: List[str] = [] - for _m in _re.finditer( - r'^\*\*\*\s+(?:Update|Add|Delete)\s+File:\s*(.+)$', - body, - _re.MULTILINE, - ): - p = _m.group(1).strip() - if p: - paths.append(p) - return paths - return [] - - -def _extract_error_preview(result: Any, max_len: int = 180) -> str: - """Pull a one-line error summary out of a tool result for footer display.""" - text = _multimodal_text_summary(result) if result is not None else "" - if not isinstance(text, str): - try: - text = str(text) - except Exception: - return "" - # Try to parse JSON and pull the ``error`` field — tool handlers return - # ``{"success": false, "error": "..."}``; raw string wins if parse fails. - stripped = text.strip() - if stripped.startswith("{"): - try: - import json as _json - data = _json.loads(stripped) - if isinstance(data, dict) and isinstance(data.get("error"), str): - text = data["error"] - except Exception: - pass - # Collapse whitespace, trim to max_len. - text = " ".join(text.split()) - if len(text) > max_len: - text = text[: max_len - 1] + "…" - return text - - -def _trajectory_normalize_msg(msg: Dict[str, Any]) -> Dict[str, Any]: - """Strip image blobs from a message for trajectory saving. - - Returns a shallow copy with multimodal tool results replaced by their - text_summary, and image parts in content lists replaced by - `[screenshot]` placeholders. 
Keeps the message schema otherwise intact. - """ - if not isinstance(msg, dict): - return msg - content = msg.get("content") - if _is_multimodal_tool_result(content): - return {**msg, "content": _multimodal_text_summary(content)} - if isinstance(content, list): - cleaned = [] - for p in content: - if isinstance(p, dict) and p.get("type") in {"image", "image_url", "input_image"}: - cleaned.append({"type": "text", "text": "[screenshot]"}) - else: - cleaned.append(p) - return {**msg, "content": cleaned} - return msg - - -def _sanitize_surrogates(text: str) -> str: - """Replace lone surrogate code points with U+FFFD (replacement character). - - Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the - OpenAI SDK. This is a fast no-op when the text contains no surrogates. - """ - if _SURROGATE_RE.search(text): - return _SURROGATE_RE.sub('\ufffd', text) - return text - - -# _summarize_user_message_for_log is imported from agent.codex_responses_adapter -# (see import block above). Remains importable from run_agent for backward compat. - - -def _sanitize_structure_surrogates(payload: Any) -> bool: - """Replace surrogate code points in nested dict/list payloads in-place. - - Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery. - Used to scrub nested structured fields (e.g. ``reasoning_details`` — an - array of dicts with ``summary``/``text`` strings) that flat per-field - checks don't reach. Returns True if any surrogates were replaced. 
- """ - found = False - - def _walk(node): - nonlocal found - if isinstance(node, dict): - for key, value in node.items(): - if isinstance(value, str): - if _SURROGATE_RE.search(value): - node[key] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - _walk(value) - elif isinstance(node, list): - for idx, value in enumerate(node): - if isinstance(value, str): - if _SURROGATE_RE.search(value): - node[idx] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - _walk(value) - - _walk(payload) - return found - - -def _sanitize_messages_surrogates(messages: list) -> bool: - """Sanitize surrogate characters from all string content in a messages list. - - Walks message dicts in-place. Returns True if any surrogates were found - and replaced, False otherwise. Covers content/text, name, tool call - metadata/arguments, AND any additional string or nested structured fields - (``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so - retries don't fail on a non-content field. Byte-level reasoning models - (xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output - that flow through to ``api_messages["reasoning_content"]`` on the next - turn and crash json.dumps inside the OpenAI SDK. 
- """ - found = False - for msg in messages: - if not isinstance(msg, dict): - continue - content = msg.get("content") - if isinstance(content, str) and _SURROGATE_RE.search(content): - msg["content"] = _SURROGATE_RE.sub('\ufffd', content) - found = True - elif isinstance(content, list): - for part in content: - if isinstance(part, dict): - text = part.get("text") - if isinstance(text, str) and _SURROGATE_RE.search(text): - part["text"] = _SURROGATE_RE.sub('\ufffd', text) - found = True - name = msg.get("name") - if isinstance(name, str) and _SURROGATE_RE.search(name): - msg["name"] = _SURROGATE_RE.sub('\ufffd', name) - found = True - tool_calls = msg.get("tool_calls") - if isinstance(tool_calls, list): - for tc in tool_calls: - if not isinstance(tc, dict): - continue - tc_id = tc.get("id") - if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id): - tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id) - found = True - fn = tc.get("function") - if isinstance(fn, dict): - fn_name = fn.get("name") - if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name): - fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name) - found = True - fn_args = fn.get("arguments") - if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args): - fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args) - found = True - # Walk any additional string / nested fields (reasoning, - # reasoning_content, reasoning_details, etc.) — surrogates from - # byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk - # in these fields and aren't covered by the per-field checks above. - # Matches _sanitize_messages_non_ascii's coverage (PR #10537). 
- for key, value in msg.items(): - if key in {"content", "name", "tool_calls", "role"}: - continue - if isinstance(value, str): - if _SURROGATE_RE.search(value): - msg[key] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - if _sanitize_structure_surrogates(value): - found = True - return found - - -def _escape_invalid_chars_in_json_strings(raw: str) -> str: - """Escape unescaped control chars inside JSON string values. - - Walks the raw JSON character-by-character, tracking whether we are - inside a double-quoted string. Inside strings, replaces literal - control characters (0x00-0x1F) that aren't already part of an escape - sequence with their ``\\uXXXX`` equivalents. Pass-through for everything - else. - - Ported from #12093 — complements the other repair passes in - ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is - not enough (e.g. llama.cpp backends that emit literal apostrophes or - tabs alongside other malformations). - """ - out: list[str] = [] - in_string = False - i = 0 - n = len(raw) - while i < n: - ch = raw[i] - if in_string: - if ch == "\\" and i + 1 < n: - # Already-escaped char — pass through as-is - out.append(ch) - out.append(raw[i + 1]) - i += 2 - continue - if ch == '"': - in_string = False - out.append(ch) - elif ord(ch) < 0x20: - out.append(f"\\u{ord(ch):04x}") - else: - out.append(ch) - else: - if ch == '"': - in_string = True - out.append(ch) - i += 1 - return "".join(out) - - -def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str: - """Attempt to repair malformed tool_call argument JSON. - - Models like GLM-5.1 via Ollama can produce truncated JSON, trailing - commas, Python ``None``, etc. The API proxy rejects these with HTTP 400 - "invalid tool call arguments". This function applies common repairs; - if all fail it returns ``"{}"`` so the request succeeds (better than - crashing the session). All repairs are logged at WARNING level. 
- """ - raw_stripped = raw_args.strip() if isinstance(raw_args, str) else "" - - # Fast-path: empty / whitespace-only -> empty object - if not raw_stripped: - logger.warning("Sanitized empty tool_call arguments for %s", tool_name) - return "{}" - - # Python-literal None -> normalise to {} - if raw_stripped == "None": - logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name) - return "{}" - - # Repair pass 0: llama.cpp backends sometimes emit literal control - # characters (tabs, newlines) inside JSON string values. json.loads - # with strict=False accepts these and lets us re-serialise the - # result into wire-valid JSON without any string surgery. This is - # the most common local-model repair case (#12068). - try: - parsed = json.loads(raw_stripped, strict=False) - reserialised = json.dumps(parsed, separators=(",", ":")) - if reserialised != raw_stripped: - logger.warning( - "Repaired unescaped control chars in tool_call arguments for %s", - tool_name, - ) - return reserialised - except (json.JSONDecodeError, TypeError, ValueError): - pass - - # Attempt common JSON repairs - fixed = raw_stripped - # 1. Strip trailing commas before } or ] - fixed = re.sub(r',\s*([}\]])', r'\1', fixed) - # 2. Close unclosed structures - open_curly = fixed.count('{') - fixed.count('}') - open_bracket = fixed.count('[') - fixed.count(']') - if open_curly > 0: - fixed += '}' * open_curly - if open_bracket > 0: - fixed += ']' * open_bracket - # 3. 
Remove excess closing braces/brackets (bounded to 50 iterations) - for _ in range(50): - try: - json.loads(fixed) - break - except json.JSONDecodeError: - if fixed.endswith('}') and fixed.count('}') > fixed.count('{'): - fixed = fixed[:-1] - elif fixed.endswith(']') and fixed.count(']') > fixed.count('['): - fixed = fixed[:-1] - else: - break - - try: - json.loads(fixed) - logger.warning( - "Repaired malformed tool_call arguments for %s: %s → %s", - tool_name, raw_stripped[:80], fixed[:80], - ) - return fixed - except json.JSONDecodeError: - pass - - # Repair pass 4: escape unescaped control chars inside JSON strings, - # then retry. Catches cases where strict=False alone fails because - # other malformations are present too. - try: - escaped = _escape_invalid_chars_in_json_strings(fixed) - if escaped != fixed: - json.loads(escaped) - logger.warning( - "Repaired control-char-laced tool_call arguments for %s: %s → %s", - tool_name, raw_stripped[:80], escaped[:80], - ) - return escaped - except (json.JSONDecodeError, TypeError, ValueError): - pass - - # Last resort: replace with empty object so the API request doesn't - # crash the entire session. - logger.warning( - "Unrepairable tool_call arguments for %s — " - "replaced with empty object (was: %s)", - tool_name, raw_stripped[:80], - ) - return "{}" - - -def _strip_non_ascii(text: str) -> str: - """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing. - - Used as a last resort when the system encoding is ASCII and can't handle - any non-ASCII characters (e.g. LANG=C on Chromebooks). - """ - return text.encode('ascii', errors='ignore').decode('ascii') - - -def _sanitize_messages_non_ascii(messages: list) -> bool: - """Strip non-ASCII characters from all string content in a messages list. - - This is a last-resort recovery for systems with ASCII-only encoding - (LANG=C, Chromebooks, minimal containers). Returns True if any - non-ASCII content was found and sanitized. 
- """ - found = False - for msg in messages: - if not isinstance(msg, dict): - continue - # Sanitize content (string) - content = msg.get("content") - if isinstance(content, str): - sanitized = _strip_non_ascii(content) - if sanitized != content: - msg["content"] = sanitized - found = True - elif isinstance(content, list): - for part in content: - if isinstance(part, dict): - text = part.get("text") - if isinstance(text, str): - sanitized = _strip_non_ascii(text) - if sanitized != text: - part["text"] = sanitized - found = True - # Sanitize name field (can contain non-ASCII in tool results) - name = msg.get("name") - if isinstance(name, str): - sanitized = _strip_non_ascii(name) - if sanitized != name: - msg["name"] = sanitized - found = True - # Sanitize tool_calls - tool_calls = msg.get("tool_calls") - if isinstance(tool_calls, list): - for tc in tool_calls: - if isinstance(tc, dict): - fn = tc.get("function", {}) - if isinstance(fn, dict): - fn_args = fn.get("arguments") - if isinstance(fn_args, str): - sanitized = _strip_non_ascii(fn_args) - if sanitized != fn_args: - fn["arguments"] = sanitized - found = True - # Sanitize any additional top-level string fields (e.g. reasoning_content) - for key, value in msg.items(): - if key in {"content", "name", "tool_calls", "role"}: - continue - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - msg[key] = sanitized - found = True - return found - - -def _sanitize_tools_non_ascii(tools: list) -> bool: - """Strip non-ASCII characters from tool payloads in-place.""" - return _sanitize_structure_non_ascii(tools) - - -def _strip_images_from_messages(messages: list) -> bool: - """Remove image_url content parts from all messages in-place. - - Called when a server signals it does not support images (e.g. - "Only 'text' content type is supported."). Mutates messages so the - next API call sends text only. 
- - Preserves message alternation invariants: - * ``tool``-role messages whose content was entirely images are replaced - with a plaintext placeholder, NOT deleted — deleting them would leave - the paired ``tool_call_id`` on the prior assistant message unmatched, - which providers reject with HTTP 400. - * Non-tool messages whose content becomes empty are dropped. In - practice this only hits synthetic image-only user messages appended - for attachment delivery; real user turns always include text. - - Returns True if any image parts were removed. - """ - found = False - to_delete = [] - for i, msg in enumerate(messages): - if not isinstance(msg, dict): - continue - content = msg.get("content") - if not isinstance(content, list): - continue - new_parts = [] - for part in content: - if isinstance(part, dict) and part.get("type") in {"image_url", "image", "input_image"}: - found = True - else: - new_parts.append(part) - if len(new_parts) < len(content): - if new_parts: - msg["content"] = new_parts - elif msg.get("role") == "tool": - # Preserve tool_call_id linkage — providers require every - # assistant tool_call to have a matching tool response. - msg["content"] = "[image content removed — server does not support images]" - else: - # Synthetic image-only user/assistant message with no text; - # safe to drop. 
- to_delete.append(i) - for i in reversed(to_delete): - del messages[i] - return found - - -def _sanitize_structure_non_ascii(payload: Any) -> bool: - """Strip non-ASCII characters from nested dict/list payloads in-place.""" - found = False - - def _walk(node): - nonlocal found - if isinstance(node, dict): - for key, value in node.items(): - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - node[key] = sanitized - found = True - elif isinstance(value, (dict, list)): - _walk(value) - elif isinstance(node, list): - for idx, value in enumerate(node): - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - node[idx] = sanitized - found = True - elif isinstance(value, (dict, list)): - _walk(value) - - _walk(payload) - return found - - - - - # ========================================================================= # Large tool result handler — save oversized output to temp file # ========================================================================= @@ -1239,1331 +413,75 @@ class AIAgent: checkpoint_max_file_size_mb: int = 10, pass_session_id: bool = False, ): - """ - Initialize the AI Agent. 
- - Args: - base_url (str): Base URL for the model API (optional) - api_key (str): API key for authentication (optional, uses env var if not provided) - provider (str): Provider identifier (optional; used for telemetry/routing hints) - api_mode (str): API mode override: "chat_completions" or "codex_responses" - model (str): Model name to use (default: "anthropic/claude-opus-4.6") - max_iterations (int): Maximum number of tool calling iterations (default: 90) - tool_delay (float): Delay between tool calls in seconds (default: 1.0) - enabled_toolsets (List[str]): Only enable tools from these toolsets (optional) - disabled_toolsets (List[str]): Disable tools from these toolsets (optional) - save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False) - verbose_logging (bool): Enable verbose logging for debugging (default: False) - quiet_mode (bool): Suppress progress output for clean CLI experience (default: False) - ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional) - log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100) - log_prefix (str): Prefix to add to all log messages for identification in parallel processing (default: "") - providers_allowed (List[str]): OpenRouter providers to allow (optional) - providers_ignored (List[str]): OpenRouter providers to ignore (optional) - providers_order (List[str]): OpenRouter providers to try in order (optional) - provider_sort (str): Sort providers by price/throughput/latency (optional) - openrouter_min_coding_score (float): Coding-score floor (0.0-1.0) for the - openrouter/pareto-code router. Only applied when model == "openrouter/pareto-code". - None or empty = let OpenRouter pick the strongest available coder. 
- session_id (str): Pre-generated session ID for logging (optional, auto-generated if not provided) - tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications - clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions. - Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error. - max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) - reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking). - If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning. - prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context. - Useful for injecting a few-shot example or priming the model's response style. - Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}] - NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an - assistant-role message (400 error). For those models use structured outputs or - output_config.format instead of a trailing-assistant prefill. - platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp"). - Used to inject platform-specific formatting hints into the system prompt. - skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules - into the system prompt. Use this for batch processing and data generation to avoid - polluting trajectories with user-specific persona or project instructions. - load_soul_identity (bool): If True, still use ~/.hermes/SOUL.md as the primary - identity even when skip_context_files=True. Project context files from the cwd - remain skipped. 
- """ - _install_safe_stdio() - - self.model = model - self.max_iterations = max_iterations - # Shared iteration budget — parent creates, children inherit. - # Consumed by every LLM turn across parent + all subagents. - self.iteration_budget = iteration_budget or IterationBudget(max_iterations) - self.tool_delay = tool_delay - self.save_trajectories = save_trajectories - self.verbose_logging = verbose_logging - self.quiet_mode = quiet_mode - self.ephemeral_system_prompt = ephemeral_system_prompt - self.platform = platform # "cli", "telegram", "discord", "whatsapp", etc. - self._user_id = user_id # Platform user identifier (gateway sessions) - self._user_name = user_name - self._chat_id = chat_id - self._chat_name = chat_name - self._chat_type = chat_type - self._thread_id = thread_id - self._gateway_session_key = gateway_session_key # Stable per-chat key (e.g. agent:main:telegram:dm:123) - # Pluggable print function — CLI replaces this with _cprint so that - # raw ANSI status lines are routed through prompt_toolkit's renderer - # instead of going directly to stdout where patch_stdout's StdoutProxy - # would mangle the escape sequences. None = use builtins.print. - self._print_fn = None - self.background_review_callback = None # Optional sync callback for gateway delivery - self.skip_context_files = skip_context_files - self.load_soul_identity = load_soul_identity - self.pass_session_id = pass_session_id - self._credential_pool = credential_pool - self.log_prefix_chars = log_prefix_chars - self.log_prefix = f"{log_prefix} " if log_prefix else "" - # Store effective base URL for feature detection (prompt caching, reasoning, etc.) 
- self.base_url = base_url or "" - provider_name = provider.strip().lower() if isinstance(provider, str) and provider.strip() else None - self.provider = provider_name or "" - self.acp_command = acp_command or command - self.acp_args = list(acp_args or args or []) - if api_mode in {"chat_completions", "codex_responses", "anthropic_messages", "bedrock_converse", "codex_app_server"}: - self.api_mode = api_mode - elif self.provider == "openai-codex": - self.api_mode = "codex_responses" - elif self.provider in {"xai", "xai-oauth"}: - self.api_mode = "codex_responses" - elif (provider_name is None) and ( - self._base_url_hostname == "chatgpt.com" - and "/backend-api/codex" in self._base_url_lower - ): - self.api_mode = "codex_responses" - self.provider = "openai-codex" - elif (provider_name is None) and self._base_url_hostname == "api.x.ai": - self.api_mode = "codex_responses" - self.provider = "xai" - elif self.provider == "anthropic" or (provider_name is None and self._base_url_hostname == "api.anthropic.com"): - self.api_mode = "anthropic_messages" - self.provider = "anthropic" - elif self._base_url_lower.rstrip("/").endswith("/anthropic"): - # Third-party Anthropic-compatible endpoints (e.g. MiniMax, DashScope) - # use a URL convention ending in /anthropic. Auto-detect these so the - # Anthropic Messages API adapter is used instead of chat completions. - self.api_mode = "anthropic_messages" - elif self.provider == "bedrock" or ( - self._base_url_hostname.startswith("bedrock-runtime.") - and base_url_host_matches(self._base_url_lower, "amazonaws.com") - ): - # AWS Bedrock — auto-detect from provider name or base URL - # (bedrock-runtime..amazonaws.com). - self.api_mode = "bedrock_converse" - else: - self.api_mode = "chat_completions" - - # Eagerly warm the transport cache so import errors surface at init, - # not mid-conversation. Also validates the api_mode is registered. 
- try: - self._get_transport() - except Exception: - pass # Non-fatal — transport may not exist for all modes yet - - try: - from hermes_cli.model_normalize import ( - _AGGREGATOR_PROVIDERS, - normalize_model_for_provider, - ) - - if self.provider not in _AGGREGATOR_PROVIDERS: - self.model = normalize_model_for_provider(self.model, self.provider) - except Exception: - pass - - # GPT-5.x models usually require the Responses API path, but some - # providers have exceptions (for example Copilot's gpt-5-mini still - # uses chat completions). Also auto-upgrade for direct OpenAI URLs - # (api.openai.com) since all newer tool-calling models prefer - # Responses there. ACP runtimes are excluded: CopilotACPClient - # handles its own routing and does not implement the Responses API - # surface. - # When api_mode was explicitly provided, respect it — the user - # knows what their endpoint supports (#10473). - # Exception: Azure OpenAI serves gpt-5.x on /chat/completions and - # does NOT support the Responses API — skip the upgrade for Azure - # (openai.azure.com), even though it looks OpenAI-compatible. - if ( - api_mode is None - and self.api_mode == "chat_completions" - and self.provider != "copilot-acp" - and not str(self.base_url or "").lower().startswith("acp://copilot") - and not str(self.base_url or "").lower().startswith("acp+tcp://") - and not self._is_azure_openai_url() - and ( - self._is_direct_openai_url() - or self._provider_model_requires_responses_api( - self.model, - provider=self.provider, - ) - ) - ): - self.api_mode = "codex_responses" - # Invalidate the eager-warmed transport cache — api_mode changed - # from chat_completions to codex_responses after the warm at __init__. - if hasattr(self, "_transport_cache"): - self._transport_cache.clear() - - # Pre-warm OpenRouter model metadata cache in a background thread. - # fetch_model_metadata() is cached for 1 hour; this avoids a blocking - # HTTP request on the first API response when pricing is estimated. 
- # Use a process-level Event so this thread is only spawned once — a new - # AIAgent is created for every gateway request, so without the guard - # each message leaks one OS thread and the process eventually exhausts - # the system thread limit (RuntimeError: can't start new thread). - if (self.provider == "openrouter" or self._is_openrouter_url()) and \ - not _openrouter_prewarm_done.is_set(): - _openrouter_prewarm_done.set() - threading.Thread( - target=fetch_model_metadata, - daemon=True, - name="openrouter-prewarm", - ).start() - - self.tool_progress_callback = tool_progress_callback - self.tool_start_callback = tool_start_callback - self.tool_complete_callback = tool_complete_callback - self.suppress_status_output = False - self.thinking_callback = thinking_callback - self.reasoning_callback = reasoning_callback - self.clarify_callback = clarify_callback - self.step_callback = step_callback - self.stream_delta_callback = stream_delta_callback - self.interim_assistant_callback = interim_assistant_callback - self.status_callback = status_callback - self.tool_gen_callback = tool_gen_callback - - - # Tool execution state — allows _vprint during tool execution - # even when stream consumers are registered (no tokens streaming then) - self._executing_tools = False - self._tool_guardrails = ToolCallGuardrailController() - self._tool_guardrail_halt_decision: ToolGuardrailDecision | None = None - - # Interrupt mechanism for breaking out of tool loops - self._interrupt_requested = False - self._interrupt_message = None # Optional message that triggered interrupt - self._execution_thread_id: int | None = None # Set at run_conversation() start - self._interrupt_thread_signal_pending = False - self._client_lock = threading.RLock() - - # /steer mechanism — inject a user note into the next tool result - # without interrupting the agent. 
Unlike interrupt(), steer() does - # NOT set _interrupt_requested; it waits for the current tool batch - # to finish naturally, then the drain hook appends the text to the - # last tool result's content so the model sees it on its next - # iteration. Message-role alternation is preserved (we modify an - # existing tool message rather than inserting a new user turn). - self._pending_steer: Optional[str] = None - self._pending_steer_lock = threading.Lock() - - # Concurrent-tool worker thread tracking. `_execute_tool_calls_concurrent` - # runs each tool on its own ThreadPoolExecutor worker — those worker - # threads have tids distinct from `_execution_thread_id`, so - # `_set_interrupt(True, _execution_thread_id)` alone does NOT cause - # `is_interrupted()` inside the worker to return True. Track the - # workers here so `interrupt()` / `clear_interrupt()` can fan out to - # their tids explicitly. - self._tool_worker_threads: set[int] = set() - self._tool_worker_threads_lock = threading.Lock() - - # Subagent delegation state - self._delegate_depth = 0 # 0 = top-level agent, incremented for children - self._active_children = [] # Running child AIAgents (for interrupt propagation) - self._active_children_lock = threading.Lock() - - # Store OpenRouter provider preferences - self.providers_allowed = providers_allowed - self.providers_ignored = providers_ignored - self.providers_order = providers_order - self.provider_sort = provider_sort - self.provider_require_parameters = provider_require_parameters - self.provider_data_collection = provider_data_collection - self.openrouter_min_coding_score = openrouter_min_coding_score - - # Store toolset filtering options - self.enabled_toolsets = enabled_toolsets - self.disabled_toolsets = disabled_toolsets - - # Model response configuration - self.max_tokens = max_tokens # None = use model default - self.reasoning_config = reasoning_config # None = use default (medium for OpenRouter) - self.service_tier = service_tier - 
self.request_overrides = dict(request_overrides or {}) - self.prefill_messages = prefill_messages or [] # Prefilled conversation turns - self._force_ascii_payload = False - - # Anthropic prompt caching: auto-enabled for Claude models on native - # Anthropic, OpenRouter, and third-party gateways that speak the - # Anthropic protocol (``api_mode == 'anthropic_messages'``). Reduces - # input costs by ~75% on multi-turn conversations. Uses system_and_3 - # strategy (4 breakpoints). See ``_anthropic_prompt_cache_policy`` - # for the layout-vs-transport decision. - self._use_prompt_caching, self._use_native_cache_layout = ( - self._anthropic_prompt_cache_policy() - ) - # Anthropic supports "5m" (default) and "1h" cache TTL tiers. Read from - # config.yaml under prompt_caching.cache_ttl; unknown values keep "5m". - # 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long - # sessions with >5-minute pauses between turns (#14971). - self._cache_ttl = "5m" - try: - from hermes_cli.config import load_config as _load_pc_cfg - - _pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {} - _ttl = _pc_cfg.get("cache_ttl", "5m") - if _ttl in {"5m", "1h"}: - self._cache_ttl = _ttl - except Exception: - pass - - # Iteration budget: the LLM is only notified when it actually exhausts - # the iteration budget (api_call_count >= max_iterations). At that - # point we inject ONE message, allow one final API call, and if the - # model doesn't produce a text response, force a user-message asking - # it to summarise. No intermediate pressure warnings — they caused - # models to "give up" prematurely on complex tasks (#7915). - self._budget_exhausted_injected = False - self._budget_grace_call = False - - # Activity tracking — updated on each API call, tool execution, and - # stream chunk. Used by the gateway timeout handler to report what the - # agent was doing when it was killed, and by the "still working" - # notifications to show progress. 
- self._last_activity_ts: float = time.time() - self._last_activity_desc: str = "initializing" - self._current_tool: str | None = None - self._api_call_count: int = 0 - - # Rate limit tracking — updated from x-ratelimit-* response headers - # after each API call. Accessed by /usage slash command. - self._rate_limit_state: Optional["RateLimitState"] = None - - # OpenRouter response cache hit counter — incremented when - # X-OpenRouter-Cache-Status: HIT is seen in streaming response headers. - self._or_cache_hits: int = 0 - - # Centralized logging — agent.log (INFO+) and errors.log (WARNING+) - # both live under ~/.hermes/logs/. Idempotent, so gateway mode - # (which creates a new AIAgent per message) won't duplicate handlers. - from hermes_logging import setup_logging, setup_verbose_logging - setup_logging(hermes_home=_hermes_home) - - if self.verbose_logging: - setup_verbose_logging() - logger.info("Verbose logging enabled (third-party library logs suppressed)") - elif self.quiet_mode: - # In quiet mode (CLI default), keep console output clean — - # but DO NOT raise per-logger levels. Doing so prevents the - # root logger's file handlers (agent.log, errors.log) from - # ever seeing the records, because Python checks - # logger.isEnabledFor() before handler propagation. We rely - # on the fact that hermes_logging.setup_logging() does not - # install a console StreamHandler in quiet mode — so INFO - # records flow to the file handlers but never reach a - # console. Any future noise reduction belongs at the - # handler level inside hermes_logging.py, not here. - pass - - # Internal stream callback (set during streaming TTS). - # Initialized here so _vprint can reference it before run_conversation. - self._stream_callback = None - # Deferred paragraph break flag — set after tool iterations so a - # single "\n\n" is prepended to the next real text delta. - self._stream_needs_break = False - # Stateful scrubber for spans split across stream - # deltas (#5719). 
sanitize_context() alone can't survive chunk - # boundaries because the block regex needs both tags in one string. - self._stream_context_scrubber = StreamingContextScrubber() - # Stateful scrubber for reasoning/thinking tags in streamed deltas - # (#17924). Replaces the per-delta _strip_think_blocks regex that - # destroyed downstream state (e.g. MiniMax-M2.7 streaming - # '' as delta1 and 'Let me check' as delta2 — the regex - # erased delta1, so downstream state machines never learned a - # block was open and leaked delta2 as content). - self._stream_think_scrubber = StreamingThinkScrubber() - # Visible assistant text already delivered through live token callbacks - # during the current model response. Used to avoid re-sending the same - # commentary when the provider later returns it as a completed interim - # assistant message. - self._current_streamed_assistant_text = "" - - # Optional current-turn user-message override used when the API-facing - # user message intentionally differs from the persisted transcript - # (e.g. CLI voice mode adds a temporary prefix for the live call only). - self._persist_user_message_idx = None - self._persist_user_message_override = None - - # Cache anthropic image-to-text fallbacks per image payload/URL so a - # single tool loop does not repeatedly re-run auxiliary vision on the - # same image history. - self._anthropic_image_fallback_cache: Dict[str, str] = {} - - # Initialize LLM client via centralized provider router. - # The router handles auth resolution, base URL, headers, and - # Codex/Anthropic wrapping for all known providers. - # raw_codex=True because the main agent needs direct responses.stream() - # access for Codex Responses API streaming. - self._anthropic_client = None - self._is_anthropic_oauth = False - - # Resolve per-provider / per-model request timeout once up front so - # every client construction path below (Anthropic native, OpenAI-wire, - # router-based implicit auth) can apply it consistently. 
Bedrock - # Claude uses its own timeout path and is not covered here. - _provider_timeout = get_provider_request_timeout(self.provider, self.model) - - if self.api_mode == "anthropic_messages": - from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token - # Bedrock + Claude → use AnthropicBedrock SDK for full feature parity - # (prompt caching, thinking budgets, adaptive thinking). - _is_bedrock_anthropic = self.provider == "bedrock" - if _is_bedrock_anthropic: - from agent.anthropic_adapter import build_anthropic_bedrock_client - _region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "") - _br_region = _region_match.group(1) if _region_match else "us-east-1" - self._bedrock_region = _br_region - self._anthropic_client = build_anthropic_bedrock_client(_br_region) - self._anthropic_api_key = "aws-sdk" - self._anthropic_base_url = base_url - self._is_anthropic_oauth = False - self.api_key = "aws-sdk" - self.client = None - self._client_kwargs = {} - if not self.quiet_mode: - print(f"🤖 AI Agent initialized with model: {self.model} (AWS Bedrock + AnthropicBedrock SDK, {_br_region})") - else: - # Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic. - # Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own API key. - # Falling back would send Anthropic credentials to third-party endpoints (Fixes #1739, #minimax-401). - _is_native_anthropic = self.provider == "anthropic" - effective_key = (api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or "") - self.api_key = effective_key - self._anthropic_api_key = effective_key - self._anthropic_base_url = base_url - # Only mark the session as OAuth-authenticated when the token - # genuinely belongs to native Anthropic. 
Third-party providers - # (MiniMax, Kimi, GLM, LiteLLM proxies) that accept the - # Anthropic protocol must never trip OAuth code paths — doing - # so injects Claude-Code identity headers and system prompts - # that cause 401/403 on their endpoints. Guards #1739 and - # the third-party identity-injection bug. - from agent.anthropic_adapter import _is_oauth_token as _is_oat - self._is_anthropic_oauth = _is_oat(effective_key) if _is_native_anthropic else False - self._anthropic_client = build_anthropic_client(effective_key, base_url, timeout=_provider_timeout) - # No OpenAI client needed for Anthropic mode - self.client = None - self._client_kwargs = {} - if not self.quiet_mode: - print(f"🤖 AI Agent initialized with model: {self.model} (Anthropic native)") - if effective_key and len(effective_key) > 12: - print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}") - elif self.api_mode == "bedrock_converse": - # AWS Bedrock — uses boto3 directly, no OpenAI client needed. - # Region is extracted from the base_url or defaults to us-east-1. - _region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "") - self._bedrock_region = _region_match.group(1) if _region_match else "us-east-1" - # Guardrail config — read from config.yaml at init time. 
- self._bedrock_guardrail_config = None - try: - from hermes_cli.config import load_config as _load_br_cfg - _gr = _load_br_cfg().get("bedrock", {}).get("guardrail", {}) - if _gr.get("guardrail_identifier") and _gr.get("guardrail_version"): - self._bedrock_guardrail_config = { - "guardrailIdentifier": _gr["guardrail_identifier"], - "guardrailVersion": _gr["guardrail_version"], - } - if _gr.get("stream_processing_mode"): - self._bedrock_guardrail_config["streamProcessingMode"] = _gr["stream_processing_mode"] - if _gr.get("trace"): - self._bedrock_guardrail_config["trace"] = _gr["trace"] - except Exception: - pass - self.client = None - self._client_kwargs = {} - if not self.quiet_mode: - _gr_label = " + Guardrails" if self._bedrock_guardrail_config else "" - print(f"🤖 AI Agent initialized with model: {self.model} (AWS Bedrock, {self._bedrock_region}{_gr_label})") - else: - if api_key and base_url: - # Explicit credentials from CLI/gateway — construct directly. - # The runtime provider resolver already handled auth for us. - # Extract query params (e.g. Azure api-version) from base_url - # and pass via default_query to prevent loss during SDK URL - # joining (httpx drops query string when joining paths). 
- _parsed_url = urlparse(base_url) - if _parsed_url.query: - _clean_url = urlunparse(_parsed_url._replace(query="")) - _query_params = { - k: v[0] for k, v in parse_qs(_parsed_url.query).items() - } - client_kwargs = { - "api_key": api_key, - "base_url": _clean_url, - "default_query": _query_params, - } - else: - client_kwargs = {"api_key": api_key, "base_url": base_url} - if _provider_timeout is not None: - client_kwargs["timeout"] = _provider_timeout - if self.provider == "copilot-acp": - client_kwargs["command"] = self.acp_command - client_kwargs["args"] = self.acp_args - effective_base = base_url - if base_url_host_matches(effective_base, "openrouter.ai"): - from agent.auxiliary_client import build_or_headers - client_kwargs["default_headers"] = build_or_headers() - elif base_url_host_matches(effective_base, "integrate.api.nvidia.com"): - from agent.auxiliary_client import build_nvidia_nim_headers - client_kwargs["default_headers"] = build_nvidia_nim_headers(effective_base) - elif base_url_host_matches(effective_base, "api.routermint.com"): - client_kwargs["default_headers"] = _routermint_headers() - elif base_url_host_matches(effective_base, "api.githubcopilot.com"): - from hermes_cli.models import copilot_default_headers - - client_kwargs["default_headers"] = copilot_default_headers() - elif base_url_host_matches(effective_base, "api.kimi.com"): - client_kwargs["default_headers"] = { - "User-Agent": "claude-code/0.1.0", - } - elif base_url_host_matches(effective_base, "portal.qwen.ai"): - client_kwargs["default_headers"] = _qwen_portal_headers() - elif base_url_host_matches(effective_base, "chatgpt.com"): - from agent.auxiliary_client import _codex_cloudflare_headers - client_kwargs["default_headers"] = _codex_cloudflare_headers(api_key) - elif "default_headers" not in client_kwargs: - # Fall back to profile.default_headers for providers that - # declare custom headers (e.g. Vercel AI Gateway attribution, - # Kimi User-Agent on non-kimi.com endpoints). 
- try: - from providers import get_provider_profile as _gpf - _ph = _gpf(self.provider) - if _ph and _ph.default_headers: - client_kwargs["default_headers"] = dict(_ph.default_headers) - except Exception: - pass - else: - # No explicit creds — use the centralized provider router - from agent.auxiliary_client import resolve_provider_client - _routed_client, _ = resolve_provider_client( - self.provider or "auto", model=self.model, raw_codex=True) - if _routed_client is not None: - client_kwargs = { - "api_key": _routed_client.api_key, - "base_url": str(_routed_client.base_url), - } - if _provider_timeout is not None: - client_kwargs["timeout"] = _provider_timeout - # Preserve provider-specific headers the router set. The - # OpenAI SDK stores caller-provided default_headers in - # _custom_headers; older/mocked clients may expose - # _default_headers instead. - _routed_headers = getattr(_routed_client, "_custom_headers", None) - if not _routed_headers: - _routed_headers = getattr(_routed_client, "_default_headers", None) - if _routed_headers: - client_kwargs["default_headers"] = dict(_routed_headers) - else: - # When the user explicitly chose a non-OpenRouter provider - # but no credentials were found, fail fast with a clear - # message instead of silently routing through OpenRouter. - _explicit = (self.provider or "").strip().lower() - if _explicit and _explicit not in {"auto", "openrouter", "custom"}: - # Look up the actual env var name from the provider - # config — some providers use non-standard names - # (e.g. alibaba → DASHSCOPE_API_KEY, not ALIBABA_API_KEY). 
- _env_hint = f"{_explicit.upper()}_API_KEY" - try: - from hermes_cli.auth import PROVIDER_REGISTRY - _pcfg = PROVIDER_REGISTRY.get(_explicit) - if _pcfg and _pcfg.api_key_env_vars: - _env_hint = _pcfg.api_key_env_vars[0] - except Exception: - pass - # --- Init-time fallback (#17929) --- - _fb_entries = [] - if isinstance(fallback_model, list): - _fb_entries = [ - f for f in fallback_model - if isinstance(f, dict) and f.get("provider") and f.get("model") - ] - elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"): - _fb_entries = [fallback_model] - _fb_resolved = False - for _fb in _fb_entries: - _fb_explicit_key = (_fb.get("api_key") or "").strip() or None - if not _fb_explicit_key: - _fb_key_env = (_fb.get("key_env") or _fb.get("api_key_env") or "").strip() - if _fb_key_env: - _fb_explicit_key = os.getenv(_fb_key_env, "").strip() or None - _fb_client, _fb_model = resolve_provider_client( - _fb["provider"], model=_fb["model"], raw_codex=True, - explicit_base_url=_fb.get("base_url"), - explicit_api_key=_fb_explicit_key, - ) - if _fb_client is not None: - self.provider = _fb["provider"] - self.model = _fb_model or _fb["model"] - self._fallback_activated = True - client_kwargs = { - "api_key": _fb_client.api_key, - "base_url": str(_fb_client.base_url), - } - if _provider_timeout is not None: - client_kwargs["timeout"] = _provider_timeout - _fb_headers = getattr(_fb_client, "_custom_headers", None) - if not _fb_headers: - _fb_headers = getattr(_fb_client, "_default_headers", None) - if _fb_headers: - client_kwargs["default_headers"] = dict(_fb_headers) - _fb_resolved = True - break - if not _fb_resolved: - raise RuntimeError( - f"Provider '{_explicit}' is set in config.yaml but no API key " - f"was found. Set the {_env_hint} environment " - f"variable, or switch to a different provider with `hermes model`." 
- ) - if not getattr(self, "_fallback_activated", False): - # No provider configured — reject with a clear message. - raise RuntimeError( - "No LLM provider configured. Run `hermes model` to " - "select a provider, or run `hermes setup` for first-time " - "configuration." - ) - - self._client_kwargs = client_kwargs # stored for rebuilding after interrupt - - # Enable fine-grained tool streaming for Claude on OpenRouter. - # Without this, Anthropic buffers the entire tool call and goes - # silent for minutes while thinking — OpenRouter's upstream proxy - # times out during the silence. The beta header makes Anthropic - # stream tool call arguments token-by-token, keeping the - # connection alive. - _effective_base = str(client_kwargs.get("base_url", "")).lower() - if base_url_host_matches(_effective_base, "openrouter.ai") and "claude" in (self.model or "").lower(): - headers = client_kwargs.get("default_headers") or {} - existing_beta = headers.get("x-anthropic-beta", "") - _FINE_GRAINED = "fine-grained-tool-streaming-2025-05-14" - if _FINE_GRAINED not in existing_beta: - if existing_beta: - headers["x-anthropic-beta"] = f"{existing_beta},{_FINE_GRAINED}" - else: - headers["x-anthropic-beta"] = _FINE_GRAINED - client_kwargs["default_headers"] = headers - - self.api_key = client_kwargs.get("api_key", "") - self.base_url = client_kwargs.get("base_url", self.base_url) - try: - self.client = self._create_openai_client(client_kwargs, reason="agent_init", shared=True) - if not self.quiet_mode: - print(f"🤖 AI Agent initialized with model: {self.model}") - if base_url: - print(f"🔗 Using custom base URL: {base_url}") - # Always show API key info (masked) for debugging auth issues - key_used = client_kwargs.get("api_key", "none") - if key_used and key_used != "dummy-key" and len(key_used) > 12: - print(f"🔑 Using API key: {key_used[:8]}...{key_used[-4:]}") - else: - print(f"⚠️ Warning: API key appears invalid or missing (got: '{key_used[:20] if key_used else 'none'}...')") - 
except Exception as e: - raise RuntimeError(f"Failed to initialize OpenAI client: {e}") - - # Provider fallback chain — ordered list of backup providers tried - # when the primary is exhausted (rate-limit, overload, connection - # failure). Supports both legacy single-dict ``fallback_model`` and - # new list ``fallback_providers`` format. - if isinstance(fallback_model, list): - self._fallback_chain = [ - f for f in fallback_model - if isinstance(f, dict) and f.get("provider") and f.get("model") - ] - elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"): - self._fallback_chain = [fallback_model] - else: - self._fallback_chain = [] - self._fallback_index = 0 - self._fallback_activated = getattr(self, "_fallback_activated", False) - # Legacy attribute kept for backward compat (tests, external callers) - self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None - if self._fallback_chain and not self.quiet_mode: - if len(self._fallback_chain) == 1: - fb = self._fallback_chain[0] - print(f"🔄 Fallback model: {fb['model']} ({fb['provider']})") - else: - print(f"🔄 Fallback chain ({len(self._fallback_chain)} providers): " + - " → ".join(f"{f['model']} ({f['provider']})" for f in self._fallback_chain)) - - # Get available tools with filtering - self.tools = get_tool_definitions( + """Forwarder — see ``agent.agent_init.init_agent``.""" + from agent.agent_init import init_agent + init_agent( + self, + base_url=base_url, + api_key=api_key, + provider=provider, + api_mode=api_mode, + acp_command=acp_command, + acp_args=acp_args, + command=command, + args=args, + model=model, + max_iterations=max_iterations, + tool_delay=tool_delay, enabled_toolsets=enabled_toolsets, disabled_toolsets=disabled_toolsets, - quiet_mode=self.quiet_mode, + save_trajectories=save_trajectories, + verbose_logging=verbose_logging, + quiet_mode=quiet_mode, + ephemeral_system_prompt=ephemeral_system_prompt, + 
log_prefix_chars=log_prefix_chars, + log_prefix=log_prefix, + providers_allowed=providers_allowed, + providers_ignored=providers_ignored, + providers_order=providers_order, + provider_sort=provider_sort, + provider_require_parameters=provider_require_parameters, + provider_data_collection=provider_data_collection, + openrouter_min_coding_score=openrouter_min_coding_score, + session_id=session_id, + tool_progress_callback=tool_progress_callback, + tool_start_callback=tool_start_callback, + tool_complete_callback=tool_complete_callback, + thinking_callback=thinking_callback, + reasoning_callback=reasoning_callback, + clarify_callback=clarify_callback, + step_callback=step_callback, + stream_delta_callback=stream_delta_callback, + interim_assistant_callback=interim_assistant_callback, + tool_gen_callback=tool_gen_callback, + status_callback=status_callback, + max_tokens=max_tokens, + reasoning_config=reasoning_config, + service_tier=service_tier, + request_overrides=request_overrides, + prefill_messages=prefill_messages, + platform=platform, + user_id=user_id, + user_name=user_name, + chat_id=chat_id, + chat_name=chat_name, + chat_type=chat_type, + thread_id=thread_id, + gateway_session_key=gateway_session_key, + skip_context_files=skip_context_files, + load_soul_identity=load_soul_identity, + skip_memory=skip_memory, + session_db=session_db, + parent_session_id=parent_session_id, + iteration_budget=iteration_budget, + fallback_model=fallback_model, + credential_pool=credential_pool, + checkpoints_enabled=checkpoints_enabled, + checkpoint_max_snapshots=checkpoint_max_snapshots, + checkpoint_max_total_size_mb=checkpoint_max_total_size_mb, + checkpoint_max_file_size_mb=checkpoint_max_file_size_mb, + pass_session_id=pass_session_id, ) - - # Show tool configuration and store valid tool names for validation - self.valid_tool_names = set() - if self.tools: - self.valid_tool_names = {tool["function"]["name"] for tool in self.tools} - tool_names = 
sorted(self.valid_tool_names) - if not self.quiet_mode: - print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}") - - # Show filtering info if applied - if enabled_toolsets: - print(f" ✅ Enabled toolsets: {', '.join(enabled_toolsets)}") - if disabled_toolsets: - print(f" ❌ Disabled toolsets: {', '.join(disabled_toolsets)}") - elif not self.quiet_mode: - print("🛠️ No tools loaded (all tools filtered out or unavailable)") - - # Check tool requirements - if self.tools and not self.quiet_mode: - requirements = check_toolset_requirements() - missing_reqs = [name for name, available in requirements.items() if not available] - if missing_reqs: - print(f"⚠️ Some tools may not work due to missing requirements: {missing_reqs}") - - # Show trajectory saving status - if self.save_trajectories and not self.quiet_mode: - print("📝 Trajectory saving enabled") - - # Show ephemeral system prompt status - if self.ephemeral_system_prompt and not self.quiet_mode: - prompt_preview = self.ephemeral_system_prompt[:60] + "..." 
if len(self.ephemeral_system_prompt) > 60 else self.ephemeral_system_prompt - print(f"🔒 Ephemeral system prompt: '{prompt_preview}' (not saved to trajectories)") - - # Show prompt caching status - if self._use_prompt_caching and not self.quiet_mode: - if self._use_native_cache_layout and self.provider == "anthropic": - source = "native Anthropic" - elif self._use_native_cache_layout: - source = "Anthropic-compatible endpoint" - else: - source = "Claude via OpenRouter" - print(f"💾 Prompt caching: ENABLED ({source}, {self._cache_ttl} TTL)") - - # Session logging setup - auto-save conversation trajectories for debugging - self.session_start = datetime.now() - if session_id: - # Use provided session ID (e.g., from CLI) - self.session_id = session_id - else: - # Generate a new session ID - timestamp_str = self.session_start.strftime("%Y%m%d_%H%M%S") - short_uuid = uuid.uuid4().hex[:6] - self.session_id = f"{timestamp_str}_{short_uuid}" - - # Expose session ID to tools (terminal, execute_code) so agents can - # reference their own session for --resume commands, cross-session - # coordination, and logging. Uses the ContextVar system from - # session_context.py for concurrency safety (gateway runs multiple - # sessions in one process). Also writes os.environ as fallback for - # CLI mode where ContextVars aren't used. 
- os.environ["HERMES_SESSION_ID"] = self.session_id - try: - from gateway.session_context import _SESSION_ID - _SESSION_ID.set(self.session_id) - except Exception: - pass # CLI/test mode — ContextVar not needed - - # Session logs go into ~/.hermes/sessions/ alongside gateway sessions - hermes_home = get_hermes_home() - self.logs_dir = hermes_home / "sessions" - self.logs_dir.mkdir(parents=True, exist_ok=True) - self.session_log_file = self.logs_dir / f"session_{self.session_id}.json" - - # Track conversation messages for session logging - self._session_messages: List[Dict[str, Any]] = [] - self._memory_write_origin = "assistant_tool" - self._memory_write_context = "foreground" - - # Cached system prompt -- built once per session, only rebuilt on compression - self._cached_system_prompt: Optional[str] = None - - # Filesystem checkpoint manager (transparent — not a tool) - from tools.checkpoint_manager import CheckpointManager - self._checkpoint_mgr = CheckpointManager( - enabled=checkpoints_enabled, - max_snapshots=checkpoint_max_snapshots, - max_total_size_mb=checkpoint_max_total_size_mb, - max_file_size_mb=checkpoint_max_file_size_mb, - ) - - # SQLite session store (optional -- provided by CLI or gateway) - self._session_db = session_db - self._parent_session_id = parent_session_id - self._last_flushed_db_idx = 0 # tracks DB-write cursor to prevent duplicate writes - self._session_db_created = False # DB row deferred to run_conversation() - self._session_init_model_config = { - "max_iterations": self.max_iterations, - "reasoning_config": reasoning_config, - "max_tokens": max_tokens, - } - - # In-memory todo list for task planning (one per agent/session) - from tools.todo_tool import TodoStore - self._todo_store = TodoStore() - - # Load config once for memory, skills, and compression sections - try: - from hermes_cli.config import load_config as _load_agent_config - _agent_cfg = _load_agent_config() - except Exception: - _agent_cfg = {} - try: - 
self._tool_guardrails = ToolCallGuardrailController( - ToolCallGuardrailConfig.from_mapping( - _agent_cfg.get("tool_loop_guardrails", {}) - ) - ) - except Exception as _tlg_err: - logger.warning("Tool loop guardrail config ignored: %s", _tlg_err) - # Cache only the derived auxiliary compression context override that is - # needed later by the startup feasibility check. Avoid exposing a - # broad pseudo-public config object on the agent instance. - self._aux_compression_context_length_config = None - - # Persistent memory (MEMORY.md + USER.md) -- loaded from disk - self._memory_store = None - self._memory_enabled = False - self._user_profile_enabled = False - self._memory_nudge_interval = 10 - self._turns_since_memory = 0 - self._iters_since_skill = 0 - if not skip_memory: - try: - mem_config = _agent_cfg.get("memory", {}) - self._memory_enabled = mem_config.get("memory_enabled", False) - self._user_profile_enabled = mem_config.get("user_profile_enabled", False) - self._memory_nudge_interval = int(mem_config.get("nudge_interval", 10)) - if self._memory_enabled or self._user_profile_enabled: - from tools.memory_tool import MemoryStore - self._memory_store = MemoryStore( - memory_char_limit=mem_config.get("memory_char_limit", 2200), - user_char_limit=mem_config.get("user_char_limit", 1375), - ) - self._memory_store.load_from_disk() - except Exception: - pass # Memory is optional -- don't break agent init - - - - # Memory provider plugin (external — one at a time, alongside built-in) - # Reads memory.provider from config to select which plugin to activate. 
- self._memory_manager = None - if not skip_memory: - try: - _mem_provider_name = mem_config.get("provider", "") if mem_config else "" - - if _mem_provider_name and _mem_provider_name.strip(): - from agent.memory_manager import MemoryManager as _MemoryManager - from plugins.memory import load_memory_provider as _load_mem - self._memory_manager = _MemoryManager() - _mp = _load_mem(_mem_provider_name) - if _mp and _mp.is_available(): - self._memory_manager.add_provider(_mp) - if self._memory_manager.providers: - _init_kwargs = { - "session_id": self.session_id, - "platform": platform or "cli", - "hermes_home": str(get_hermes_home()), - "agent_context": "primary", - } - # Thread session title for memory provider scoping - # (e.g. honcho uses this to derive chat-scoped session keys) - if self._session_db: - try: - _st = self._session_db.get_session_title(self.session_id) - if _st: - _init_kwargs["session_title"] = _st - except Exception: - pass - # Thread gateway user identity for per-user memory scoping - if self._user_id: - _init_kwargs["user_id"] = self._user_id - if self._user_name: - _init_kwargs["user_name"] = self._user_name - if self._chat_id: - _init_kwargs["chat_id"] = self._chat_id - if self._chat_name: - _init_kwargs["chat_name"] = self._chat_name - if self._chat_type: - _init_kwargs["chat_type"] = self._chat_type - if self._thread_id: - _init_kwargs["thread_id"] = self._thread_id - # Thread gateway session key for stable per-chat Honcho session isolation - if self._gateway_session_key: - _init_kwargs["gateway_session_key"] = self._gateway_session_key - # Profile identity for per-profile provider scoping - try: - from hermes_cli.profiles import get_active_profile_name - _profile = get_active_profile_name() - _init_kwargs["agent_identity"] = _profile - _init_kwargs["agent_workspace"] = "hermes" - except Exception: - pass - self._memory_manager.initialize_all(**_init_kwargs) - logger.info("Memory provider '%s' activated", _mem_provider_name) - else: - 
logger.debug("Memory provider '%s' not found or not available", _mem_provider_name) - self._memory_manager = None - except Exception as _mpe: - logger.warning("Memory provider plugin init failed: %s", _mpe) - self._memory_manager = None - - # Inject memory provider tool schemas into the tool surface. - # Skip tools whose names already exist (plugins may register the - # same tools via ctx.register_tool(), which lands in self.tools - # through get_tool_definitions()). Duplicate function names cause - # 400 errors on providers that enforce unique names (e.g. Xiaomi - # MiMo via Nous Portal). - if self._memory_manager and self.tools is not None: - _existing_tool_names = { - t.get("function", {}).get("name") - for t in self.tools - if isinstance(t, dict) - } - for _schema in self._memory_manager.get_all_tool_schemas(): - _tname = _schema.get("name", "") - if _tname and _tname in _existing_tool_names: - continue # already registered via plugin path - _wrapped = {"type": "function", "function": _schema} - self.tools.append(_wrapped) - if _tname: - self.valid_tool_names.add(_tname) - _existing_tool_names.add(_tname) - - # Skills config: nudge interval for skill creation reminders - self._skill_nudge_interval = 10 - try: - skills_config = _agent_cfg.get("skills", {}) - self._skill_nudge_interval = int(skills_config.get("creation_nudge_interval", 10)) - except Exception: - pass - - # Tool-use enforcement config: "auto" (default — matches hardcoded - # model list), true (always), false (never), or list of substrings. - _agent_section = _agent_cfg.get("agent", {}) - if not isinstance(_agent_section, dict): - _agent_section = {} - self._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto") - - # App-level API retry count (wraps each model API call). Default 3, - # overridable via agent.api_max_retries in config.yaml. See #11616. 
- try: - _raw_api_retries = _agent_section.get("api_max_retries", 3) - _api_retries = int(_raw_api_retries) - _api_retries = max(_api_retries, 1) # 1 = no retry (single attempt) - except (TypeError, ValueError): - _api_retries = 3 - self._api_max_retries = _api_retries - - # Initialize context compressor for automatic context management - # Compresses conversation when approaching model's context limit - # Configuration via config.yaml (compression section) - _compression_cfg = _agent_cfg.get("compression", {}) - if not isinstance(_compression_cfg, dict): - _compression_cfg = {} - compression_threshold = float(_compression_cfg.get("threshold", 0.50)) - try: - from agent.auxiliary_client import _compression_threshold_for_model as _cthresh_fn - _model_cthresh = _cthresh_fn(self.model) - if _model_cthresh is not None: - compression_threshold = _model_cthresh - except Exception: - pass - compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"} - compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20)) - compression_protect_last = int(_compression_cfg.get("protect_last_n", 20)) - # protect_first_n is the number of non-system messages to protect at - # the head, in addition to the system prompt (which is always - # implicitly protected by the compressor). Floor at 0 — a value of - # 0 means "preserve only the system prompt + summary + tail", which - # is a legitimate (and common) configuration for long-running - # rolling-compaction sessions. - compression_protect_first = max( - 0, int(_compression_cfg.get("protect_first_n", 3)) - ) - - # Read optional explicit context_length override for the auxiliary - # compression model. Custom endpoints often cannot report this via - # /models, so the startup feasibility check needs the config hint. 
- try: - _aux_cfg = cfg_get(_agent_cfg, "auxiliary", "compression", default={}) - except Exception: - _aux_cfg = {} - if isinstance(_aux_cfg, dict): - _aux_context_config = _aux_cfg.get("context_length") - else: - _aux_context_config = None - if _aux_context_config is not None: - try: - _aux_context_config = int(_aux_context_config) - except (TypeError, ValueError): - _aux_context_config = None - self._aux_compression_context_length_config = _aux_context_config - - # Read explicit model output-token override from config when the - # caller did not pass one directly. - _model_cfg = _agent_cfg.get("model", {}) - if self.max_tokens is None and isinstance(_model_cfg, dict): - _config_max_tokens = _model_cfg.get("max_tokens") - if _config_max_tokens is not None: - try: - if isinstance(_config_max_tokens, bool): - raise ValueError - _parsed_max_tokens = int(_config_max_tokens) - if _parsed_max_tokens <= 0: - raise ValueError - self.max_tokens = _parsed_max_tokens - except (TypeError, ValueError): - logger.warning( - "Invalid model.max_tokens in config.yaml: %r — " - "must be a positive integer (e.g. 4096). " - "Falling back to provider default.", - _config_max_tokens, - ) - print( - f"\n⚠ Invalid model.max_tokens in config.yaml: {_config_max_tokens!r}\n" - f" Must be a positive integer (e.g. 4096).\n" - f" Falling back to provider default.\n", - file=sys.stderr, - ) - self._session_init_model_config["max_tokens"] = self.max_tokens - - # Read explicit context_length override from model config - if isinstance(_model_cfg, dict): - _config_context_length = _model_cfg.get("context_length") - else: - _config_context_length = None - if _config_context_length is not None: - try: - _config_context_length = int(_config_context_length) - except (TypeError, ValueError): - logger.warning( - "Invalid model.context_length in config.yaml: %r — " - "must be a plain integer (e.g. 256000, not '256K'). 
" - "Falling back to auto-detection.", - _config_context_length, - ) - print( - f"\n⚠ Invalid model.context_length in config.yaml: {_config_context_length!r}\n" - f" Must be a plain integer (e.g. 256000, not '256K').\n" - f" Falling back to auto-detected context window.\n", - file=sys.stderr, - ) - _config_context_length = None - - # Resolve custom_providers list once for reuse below (startup - # context-length override and plugin context-engine init). - try: - from hermes_cli.config import get_compatible_custom_providers - _custom_providers = get_compatible_custom_providers(_agent_cfg) - except Exception: - _custom_providers = _agent_cfg.get("custom_providers") - if not isinstance(_custom_providers, list): - _custom_providers = [] - - # Store for reuse by _check_compression_model_feasibility (auxiliary - # compression model context-length detection needs the same list). - self._custom_providers = _custom_providers - - # Check custom_providers per-model context_length - if _config_context_length is None and _custom_providers: - try: - from hermes_cli.config import get_custom_provider_context_length - _cp_ctx_resolved = get_custom_provider_context_length( - model=self.model, - base_url=self.base_url, - custom_providers=_custom_providers, - ) - if _cp_ctx_resolved: - _config_context_length = int(_cp_ctx_resolved) - except Exception: - _cp_ctx_resolved = None - - # Surface a clear warning if the user set a context_length but it - # wasn't a valid positive int — the helper silently skips those. 
- if _config_context_length is None: - _target = self.base_url.rstrip("/") if self.base_url else "" - for _cp_entry in _custom_providers: - if not isinstance(_cp_entry, dict): - continue - _cp_url = (_cp_entry.get("base_url") or "").rstrip("/") - if _target and _cp_url == _target: - _cp_models = _cp_entry.get("models", {}) - if isinstance(_cp_models, dict): - _cp_model_cfg = _cp_models.get(self.model, {}) - if isinstance(_cp_model_cfg, dict): - _cp_ctx = _cp_model_cfg.get("context_length") - if _cp_ctx is not None: - try: - _parsed = int(_cp_ctx) - if _parsed <= 0: - raise ValueError - except (TypeError, ValueError): - logger.warning( - "Invalid context_length for model %r in " - "custom_providers: %r — must be a positive " - "integer (e.g. 256000, not '256K'). " - "Falling back to auto-detection.", - self.model, _cp_ctx, - ) - print( - f"\n⚠ Invalid context_length for model {self.model!r} in custom_providers: {_cp_ctx!r}\n" - f" Must be a positive integer (e.g. 256000, not '256K').\n" - f" Falling back to auto-detected context window.\n", - file=sys.stderr, - ) - break - - # Persist for reuse on switch_model / fallback activation. Must come - # AFTER the custom_providers branch so per-model overrides aren't lost. - self._config_context_length = _config_context_length - - self._ensure_lmstudio_runtime_loaded(_config_context_length) - - - - # Select context engine: config-driven (like memory providers). - # 1. Check config.yaml context.engine setting - # 2. Check plugins/context_engine// directory (repo-shipped) - # 3. Check general plugin system (user-installed plugins) - # 4. 
Fall back to built-in ContextCompressor - _selected_engine = None - _engine_name = "compressor" # default - try: - _ctx_cfg = _agent_cfg.get("context", {}) if isinstance(_agent_cfg, dict) else {} - _engine_name = _ctx_cfg.get("engine", "compressor") or "compressor" - except Exception: - pass - - if _engine_name != "compressor": - # Try loading from plugins/context_engine// - try: - from plugins.context_engine import load_context_engine - _selected_engine = load_context_engine(_engine_name) - except Exception as _ce_load_err: - logger.debug("Context engine load from plugins/context_engine/: %s", _ce_load_err) - - # Try general plugin system as fallback - if _selected_engine is None: - try: - from hermes_cli.plugins import get_plugin_context_engine - _candidate = get_plugin_context_engine() - if _candidate and _candidate.name == _engine_name: - _selected_engine = _candidate - except Exception: - pass - - if _selected_engine is None: - logger.warning( - "Context engine '%s' not found — falling back to built-in compressor", - _engine_name, - ) - # else: config says "compressor" — use built-in, don't auto-activate plugins - - if _selected_engine is not None: - self.context_compressor = _selected_engine - # Resolve context_length for plugin engines — mirrors switch_model() path - from agent.model_metadata import get_model_context_length - _plugin_ctx_len = get_model_context_length( - self.model, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - config_context_length=_config_context_length, - provider=self.provider, - custom_providers=_custom_providers, - ) - self.context_compressor.update_model( - model=self.model, - context_length=_plugin_ctx_len, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - if not self.quiet_mode: - logger.info("Using context engine: %s", _selected_engine.name) - else: - self.context_compressor = ContextCompressor( - model=self.model, - threshold_percent=compression_threshold, - 
protect_first_n=compression_protect_first, - protect_last_n=compression_protect_last, - summary_target_ratio=compression_target_ratio, - summary_model_override=None, - quiet_mode=self.quiet_mode, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - config_context_length=_config_context_length, - provider=self.provider, - api_mode=self.api_mode, - ) - self.compression_enabled = compression_enabled - - # Reject models whose context window is below the minimum required - # for reliable tool-calling workflows (64K tokens). - from agent.model_metadata import MINIMUM_CONTEXT_LENGTH - _ctx = getattr(self.context_compressor, "context_length", 0) - if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH: - raise ValueError( - f"Model {self.model} has a context window of {_ctx:,} tokens, " - f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required " - f"by Hermes Agent. Choose a model with at least " - f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set " - f"model.context_length in config.yaml to override." - ) - - # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand). - # Skip names that are already present — the get_tool_definitions() - # quiet_mode cache returned a shared list pre-#17335, so a stray - # mutation here would poison subsequent agent inits in the same - # Gateway process and trip provider-side 'duplicate tool name' - # errors. Even with the cache fix, dedup is the right defense - # against plugin paths that may register the same schemas via - # ctx.register_tool(). Mirrors the memory tools dedup above. 
- self._context_engine_tool_names: set = set() - if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None: - _existing_tool_names = { - t.get("function", {}).get("name") - for t in self.tools - if isinstance(t, dict) - } - for _schema in self.context_compressor.get_tool_schemas(): - _tname = _schema.get("name", "") - if _tname and _tname in _existing_tool_names: - continue # already registered via plugin/cache path - _wrapped = {"type": "function", "function": _schema} - self.tools.append(_wrapped) - if _tname: - self.valid_tool_names.add(_tname) - self._context_engine_tool_names.add(_tname) - _existing_tool_names.add(_tname) - - # Notify context engine of session start - if hasattr(self, "context_compressor") and self.context_compressor: - try: - self.context_compressor.on_session_start( - self.session_id, - hermes_home=str(get_hermes_home()), - platform=self.platform or "cli", - model=self.model, - context_length=getattr(self.context_compressor, "context_length", 0), - ) - except Exception as _ce_err: - logger.debug("Context engine on_session_start: %s", _ce_err) - - self._subdirectory_hints = SubdirectoryHintTracker( - working_dir=os.getenv("TERMINAL_CWD") or None, - ) - self._user_turn_count = 0 - - # Cumulative token usage for the session - self.session_prompt_tokens = 0 - self.session_completion_tokens = 0 - self.session_total_tokens = 0 - self.session_api_calls = 0 - self.session_input_tokens = 0 - self.session_output_tokens = 0 - self.session_cache_read_tokens = 0 - self.session_cache_write_tokens = 0 - self.session_reasoning_tokens = 0 - self.session_estimated_cost_usd = 0.0 - self.session_cost_status = "unknown" - self.session_cost_source = "none" - - # ── Ollama num_ctx injection ── - # Ollama defaults to 2048 context regardless of the model's capabilities. - # When running against an Ollama server, detect the model's max context - # and pass num_ctx on every chat request so the full window is used. 
- # User override: set model.ollama_num_ctx in config.yaml to cap VRAM use. - # If model.context_length is set, it caps num_ctx so the user's VRAM - # budget is respected even when GGUF metadata advertises a larger window. - self._ollama_num_ctx: int | None = None - _ollama_num_ctx_override = None - if isinstance(_model_cfg, dict): - _ollama_num_ctx_override = _model_cfg.get("ollama_num_ctx") - if _ollama_num_ctx_override is not None: - try: - self._ollama_num_ctx = int(_ollama_num_ctx_override) - except (TypeError, ValueError): - logger.debug("Invalid ollama_num_ctx config value: %r", _ollama_num_ctx_override) - if self._ollama_num_ctx is None and self.base_url and is_local_endpoint(self.base_url): - try: - _detected = query_ollama_num_ctx(self.model, self.base_url, api_key=self.api_key or "") - if _detected and _detected > 0: - self._ollama_num_ctx = _detected - except Exception as exc: - logger.debug("Ollama num_ctx detection failed: %s", exc) - # Cap auto-detected ollama_num_ctx to the user's explicit context_length. - # Without this, GGUF metadata can advertise 256K+ which Ollama honours - # by allocating that much VRAM — blowing up small GPUs even though the - # user explicitly set a smaller context_length in config.yaml. 
- if ( - self._ollama_num_ctx - and _config_context_length - and _ollama_num_ctx_override is None # don't override explicit ollama_num_ctx - and self._ollama_num_ctx > _config_context_length - ): - logger.info( - "Ollama num_ctx capped: %d -> %d (model.context_length override)", - self._ollama_num_ctx, _config_context_length, - ) - self._ollama_num_ctx = _config_context_length - if self._ollama_num_ctx and not self.quiet_mode: - logger.info( - "Ollama num_ctx: will request %d tokens (model max from /api/show)", - self._ollama_num_ctx, - ) - - if not self.quiet_mode: - if compression_enabled: - print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {self.context_compressor.threshold_tokens:,})") - else: - print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)") - - # Check immediately so CLI users see the warning at startup. - # Gateway status_callback is not yet wired, so any warning is stored - # in _compression_warning and replayed in the first run_conversation(). - self._compression_warning = None - self._check_compression_model_feasibility() - - # Snapshot primary runtime for per-turn restoration. When fallback - # activates during a turn, the next turn restores these values so the - # preferred model gets a fresh attempt each time. Uses a single dict - # so new state fields are easy to add without N individual attributes. - _cc = self.context_compressor - self._primary_runtime = { - "model": self.model, - "provider": self.provider, - "base_url": self.base_url, - "api_mode": self.api_mode, - "api_key": getattr(self, "api_key", ""), - "client_kwargs": dict(self._client_kwargs), - "use_prompt_caching": self._use_prompt_caching, - "use_native_cache_layout": self._use_native_cache_layout, - # Context engine state that _try_activate_fallback() overwrites. 
- # Use getattr for model/base_url/api_key/provider since plugin - # engines may not have these (they're ContextCompressor-specific). - "compressor_model": getattr(_cc, "model", self.model), - "compressor_base_url": getattr(_cc, "base_url", self.base_url), - "compressor_api_key": getattr(_cc, "api_key", ""), - "compressor_provider": getattr(_cc, "provider", self.provider), - "compressor_context_length": _cc.context_length, - "compressor_threshold_tokens": _cc.threshold_tokens, - } - if self.api_mode == "anthropic_messages": - self._primary_runtime.update({ - "anthropic_api_key": self._anthropic_api_key, - "anthropic_base_url": self._anthropic_base_url, - "is_anthropic_oauth": self._is_anthropic_oauth, - }) def _get_session_db_for_recall(self): """Return a SessionDB for recall, lazily creating it if an entrypoint forgot. @@ -2679,198 +597,9 @@ class AIAgent: logger.debug("LM Studio preload skipped: %s", err) def switch_model(self, new_model, new_provider, api_key='', base_url='', api_mode=''): - """Switch the model/provider in-place for a live agent. - - Called by the /model command handlers (CLI and gateway) after - ``model_switch.switch_model()`` has resolved credentials and - validated the model. This method performs the actual runtime - swap: rebuilding clients, updating caching flags, and refreshing - the context compressor. - - The implementation mirrors ``_try_activate_fallback()`` for the - client-swap logic but also updates ``_primary_runtime`` so the - change persists across turns (unlike fallback which is - turn-scoped). - """ - from hermes_cli.providers import determine_api_mode - - # ── Determine api_mode if not provided ── - if not api_mode: - api_mode = determine_api_mode(new_provider, base_url) - - # Defense-in-depth: ensure OpenCode base_url doesn't carry a trailing - # /v1 into the anthropic_messages client, which would cause the SDK to - # hit /v1/v1/messages. 
`model_switch.switch_model()` already strips - # this, but we guard here so any direct callers (future code paths, - # tests) can't reintroduce the double-/v1 404 bug. - if ( - api_mode == "anthropic_messages" - and new_provider in {"opencode-zen", "opencode-go"} - and isinstance(base_url, str) - and base_url - ): - base_url = re.sub(r"/v1/?$", "", base_url) - - old_model = self.model - old_provider = self.provider - - # Clear the per-config context_length override so the new model's - # actual context window is resolved via get_model_context_length() - # instead of inheriting the stale value from the previous model. - self._config_context_length = None - - # ── Swap core runtime fields ── - self.model = new_model - self.provider = new_provider - # Use new base_url when provided; only fall back to current when the - # new provider genuinely has no endpoint (e.g. native SDK providers). - # Without this guard the old provider's URL (e.g. Ollama's localhost - # address) would persist silently after switching to a cloud provider - # that returns an empty base_url string. - if base_url: - self.base_url = base_url - self.api_mode = api_mode - # Invalidate transport cache — new api_mode may need a different transport - if hasattr(self, "_transport_cache"): - self._transport_cache.clear() - if api_key: - self.api_key = api_key - - # ── Build new client ── - if api_mode == "anthropic_messages": - from agent.anthropic_adapter import ( - build_anthropic_client, - resolve_anthropic_token, - _is_oauth_token, - ) - # Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic. - # Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own - # API key — falling back would send Anthropic credentials to third-party endpoints. 
- _is_native_anthropic = new_provider == "anthropic" - effective_key = (api_key or self.api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or self.api_key or "") - self.api_key = effective_key - self._anthropic_api_key = effective_key - self._anthropic_base_url = base_url or getattr(self, "_anthropic_base_url", None) - self._anthropic_client = build_anthropic_client( - effective_key, self._anthropic_base_url, - timeout=get_provider_request_timeout(self.provider, self.model), - ) - self._is_anthropic_oauth = _is_oauth_token(effective_key) if _is_native_anthropic else False - self.client = None - self._client_kwargs = {} - else: - effective_key = api_key or self.api_key - effective_base = base_url or self.base_url - self._client_kwargs = { - "api_key": effective_key, - "base_url": effective_base, - } - _sm_timeout = get_provider_request_timeout(self.provider, self.model) - if _sm_timeout is not None: - self._client_kwargs["timeout"] = _sm_timeout - self.client = self._create_openai_client( - dict(self._client_kwargs), - reason="switch_model", - shared=True, - ) - - # ── Re-evaluate prompt caching ── - self._use_prompt_caching, self._use_native_cache_layout = ( - self._anthropic_prompt_cache_policy( - provider=new_provider, - base_url=self.base_url, - api_mode=api_mode, - model=new_model, - ) - ) - - # ── LM Studio: preload before probing context length ── - self._ensure_lmstudio_runtime_loaded() - - # ── Update context compressor ── - if hasattr(self, "context_compressor") and self.context_compressor: - from agent.model_metadata import get_model_context_length - # Re-read custom_providers from live config so per-model - # context_length overrides are honored when switching to a - # custom provider mid-session (closes #15779). 
- _sm_custom_providers = None - try: - from hermes_cli.config import load_config, get_compatible_custom_providers - _sm_cfg = load_config() - _sm_custom_providers = get_compatible_custom_providers(_sm_cfg) - except Exception: - _sm_custom_providers = None - new_context_length = get_model_context_length( - self.model, - base_url=self.base_url, - api_key=self.api_key, - provider=self.provider, - config_context_length=getattr(self, "_config_context_length", None), - custom_providers=_sm_custom_providers, - ) - self.context_compressor.update_model( - model=self.model, - context_length=new_context_length, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - api_mode=self.api_mode, - ) - - # ── Invalidate cached system prompt so it rebuilds next turn ── - self._cached_system_prompt = None - - # ── Update _primary_runtime so the change persists across turns ── - _cc = self.context_compressor if hasattr(self, "context_compressor") and self.context_compressor else None - self._primary_runtime = { - "model": self.model, - "provider": self.provider, - "base_url": self.base_url, - "api_mode": self.api_mode, - "api_key": getattr(self, "api_key", ""), - "client_kwargs": dict(self._client_kwargs), - "use_prompt_caching": self._use_prompt_caching, - "use_native_cache_layout": self._use_native_cache_layout, - "compressor_model": getattr(_cc, "model", self.model) if _cc else self.model, - "compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url, - "compressor_api_key": getattr(_cc, "api_key", "") if _cc else "", - "compressor_provider": getattr(_cc, "provider", self.provider) if _cc else self.provider, - "compressor_context_length": _cc.context_length if _cc else 0, - "compressor_threshold_tokens": _cc.threshold_tokens if _cc else 0, - } - if api_mode == "anthropic_messages": - self._primary_runtime.update({ - "anthropic_api_key": self._anthropic_api_key, - "anthropic_base_url": self._anthropic_base_url, - 
"is_anthropic_oauth": self._is_anthropic_oauth, - }) - - # ── Reset fallback state ── - self._fallback_activated = False - self._fallback_index = 0 - - # When the user deliberately swaps primary providers (e.g. openrouter - # → anthropic), drop any fallback entries that target the OLD primary - # or the NEW one. The chain was seeded from config at agent init for - # the original provider — without pruning, a failed turn on the new - # primary silently re-activates the provider the user just rejected, - # which is exactly what was reported during TUI v2 blitz testing - # ("switched to anthropic, tui keeps trying openrouter"). - old_norm = (old_provider or "").strip().lower() - new_norm = (new_provider or "").strip().lower() - fallback_chain = list(getattr(self, "_fallback_chain", []) or []) - if old_norm and new_norm and old_norm != new_norm: - fallback_chain = [ - entry for entry in fallback_chain - if (entry.get("provider") or "").strip().lower() not in {old_norm, new_norm} - ] - self._fallback_chain = fallback_chain - self._fallback_model = fallback_chain[0] if fallback_chain else None - - logging.info( - "Model switched in-place: %s (%s) -> %s (%s)", - old_model, old_provider, new_model, new_provider, - ) + """Forwarder — see ``agent.agent_runtime_helpers.switch_model``.""" + from agent.agent_runtime_helpers import switch_model + return switch_model(self, new_model, new_provider, api_key, base_url, api_mode) def _safe_print(self, *args, **kwargs): """Print that silently handles broken pipes / closed stdout. @@ -2987,99 +716,28 @@ class AIAgent: except Exception: logger.debug("status_callback error in _emit_warning", exc_info=True) - # Headers we capture from the dying stream's HTTP response so post-mortem - # diagnosis can answer "which CF edge / which OpenRouter downstream - # provider / which request id". Lowercased; httpx returns CIMultiDict. 
- _STREAM_DIAG_HEADERS = ( - "cf-ray", - "cf-cache-status", - "x-openrouter-provider", - "x-openrouter-model", - "x-openrouter-id", - "x-request-id", - "x-vercel-id", - "via", - "server", - "x-forwarded-for", - ) + # Stream-diagnostic class header preserved for backward compat — + # actual list lives in ``agent.stream_diag.STREAM_DIAG_HEADERS``. + from agent.stream_diag import STREAM_DIAG_HEADERS as _STREAM_DIAG_HEADERS # noqa: E402 @staticmethod def _stream_diag_init() -> Dict[str, Any]: - """Return a fresh per-attempt diagnostic dict. - - Mutated in-place by the streaming functions and read from the retry - block when a stream dies. Lives on ``request_client_holder`` so it - survives across the closure boundary. - """ - return { - "started_at": time.time(), - "first_chunk_at": None, - "chunks": 0, - "bytes": 0, - "headers": {}, - "http_status": None, - } + """Forwarder — see ``agent.stream_diag.stream_diag_init``.""" + from agent.stream_diag import stream_diag_init + return stream_diag_init() def _stream_diag_capture_response( self, diag: Dict[str, Any], http_response: Any ) -> None: - """Snapshot interesting headers + HTTP status from the live stream. - - Called once at stream open (before iterating chunks) so the metadata - survives even if the stream dies before any chunk arrives. Failures - are swallowed — diag is best-effort. - """ - if http_response is None or not isinstance(diag, dict): - return - try: - diag["http_status"] = getattr(http_response, "status_code", None) - except Exception: - pass - try: - headers = getattr(http_response, "headers", None) or {} - captured: Dict[str, str] = {} - for name in self._STREAM_DIAG_HEADERS: - try: - val = headers.get(name) - if val: - # Truncate single-value to keep log lines bounded. 
- captured[name] = str(val)[:120] - except Exception: - continue - diag["headers"] = captured - except Exception: - pass + """Forwarder — see ``agent.stream_diag.stream_diag_capture_response``.""" + from agent.stream_diag import stream_diag_capture_response + stream_diag_capture_response(self, diag, http_response) @staticmethod def _flatten_exception_chain(error: BaseException) -> str: - """Return a compact ``Outer(msg) <- Inner(msg) <- ...`` rendering. - - OpenAI SDK wraps httpx errors as ``APIConnectionError`` / - ``APIError`` and only the wrapper's class is visible at the catch - site — but the underlying ``RemoteProtocolError`` / - ``ConnectError`` / ``ReadError`` is what tells us WHY the stream - died. Walks ``__cause__`` then ``__context__`` (deduped, max 4 - deep) to surface the chain in one line. - """ - seen: List[BaseException] = [] - link: Optional[BaseException] = error - while link is not None and len(seen) < 4: - if link in seen: - break - seen.append(link) - nxt = getattr(link, "__cause__", None) or getattr( - link, "__context__", None - ) - if nxt is None or nxt is link: - break - link = nxt - parts: List[str] = [] - for e in seen: - msg = str(e).strip().replace("\n", " ") - if len(msg) > 140: - msg = msg[:140] + "…" - parts.append(f"{type(e).__name__}({msg})" if msg else type(e).__name__) - return " <- ".join(parts) if parts else type(error).__name__ + """Forwarder — see ``agent.stream_diag.flatten_exception_chain``.""" + from agent.stream_diag import flatten_exception_chain + return flatten_exception_chain(error) def _is_provider_stream_parse_error(self, error: BaseException) -> bool: """Return True for malformed provider streaming data from SDK parsers. @@ -3109,88 +767,12 @@ class AIAgent: mid_tool_call: bool, diag: Optional[Dict[str, Any]] = None, ) -> None: - """Record a transient stream-drop and retry to ``agent.log``. - - Always logs a structured WARNING so users have a breadcrumb regardless - of UI verbosity. 
Subagents in particular benefit because their - retries no longer spam the parent's terminal — but the file log keeps - full detail (provider, error class, attempt, base_url, subagent_id). - - When *diag* is provided (the per-attempt stream-diagnostic dict from - ``_stream_diag_init``), the WARNING also captures upstream headers - (cf-ray, x-openrouter-provider, x-openrouter-id), HTTP status, bytes - streamed before the drop, and elapsed time on the dying attempt. - These are the breadcrumbs needed to answer "is one CF edge / one - downstream provider responsible, or is it random across runs?" - """ - try: - try: - _summary = self._summarize_api_error(error) - except Exception: - _summary = str(error) - if _summary and len(_summary) > 240: - _summary = _summary[:240] + "…" - - # Inner-cause chain (httpx errors hide under openai.APIError). - try: - _chain = self._flatten_exception_chain(error) - except Exception: - _chain = type(error).__name__ - - # Per-attempt counters and upstream headers. - _now = time.time() - _bytes = 0 - _chunks = 0 - _elapsed = 0.0 - _ttfb = None - _headers_repr = "-" - _http_status = "-" - if isinstance(diag, dict): - try: - _bytes = int(diag.get("bytes") or 0) - _chunks = int(diag.get("chunks") or 0) - _started = float(diag.get("started_at") or _now) - _elapsed = max(0.0, _now - _started) - _first = diag.get("first_chunk_at") - if _first is not None: - _ttfb = max(0.0, float(_first) - _started) - headers = diag.get("headers") or {} - if isinstance(headers, dict) and headers: - _headers_repr = " ".join( - f"{k}={v}" for k, v in headers.items() - ) - if diag.get("http_status") is not None: - _http_status = str(diag.get("http_status")) - except Exception: - pass - - logger.warning( - "Stream %s on attempt %s/%s — retrying. 
" - "subagent_id=%s depth=%s provider=%s base_url=%s " - "error_type=%s error=%s " - "chain=%s " - "http_status=%s bytes=%d chunks=%d elapsed=%.2fs ttfb=%s " - "upstream=[%s]", - kind, - attempt, - max_attempts, - getattr(self, "_subagent_id", None) or "-", - getattr(self, "_delegate_depth", 0), - self.provider or "-", - self.base_url or "-", - type(error).__name__, - _summary, - _chain, - _http_status, - _bytes, - _chunks, - _elapsed, - f"{_ttfb:.2f}s" if _ttfb is not None else "-", - _headers_repr, - extra={"mid_tool_call": mid_tool_call}, - ) - except Exception: - logger.debug("stream-retry log emit failed", exc_info=True) + """Forwarder — see ``agent.stream_diag.log_stream_retry``.""" + from agent.stream_diag import log_stream_retry + log_stream_retry( + self, kind=kind, error=error, attempt=attempt, + max_attempts=max_attempts, mid_tool_call=mid_tool_call, diag=diag, + ) def _emit_stream_drop( self, @@ -3201,53 +783,12 @@ class AIAgent: mid_tool_call: bool, diag: Optional[Dict[str, Any]] = None, ) -> None: - """Emit a single user-visible line for a stream drop+retry. - - Both top-level agents and subagents announce drops in the UI — the - parent prefixes subagent lines with ``[subagent-N]`` via ``log_prefix`` - so they're easy to attribute. All cases also write a structured - WARNING to ``agent.log`` via :meth:`_log_stream_retry` with the full - diagnostic detail (subagent_id, provider, base_url, error_type, - cf-ray, x-openrouter-provider, bytes/chunks, elapsed) for post-hoc - analysis. - - The user-visible status line is intentionally compact: provider, - error class, attempt N/M, plus ``after Xs`` when the stream dropped - mid-flight. Full diagnostic detail goes to ``agent.log`` only — - ``hermes logs --level WARNING | grep "Stream drop"`` to inspect. 
- """ - kind = "drop mid tool-call" if mid_tool_call else "drop" - self._log_stream_retry( - kind=kind, - error=error, - attempt=attempt, - max_attempts=max_attempts, - mid_tool_call=mid_tool_call, - diag=diag, + """Forwarder — see ``agent.stream_diag.emit_stream_drop``.""" + from agent.stream_diag import emit_stream_drop + emit_stream_drop( + self, error=error, attempt=attempt, max_attempts=max_attempts, + mid_tool_call=mid_tool_call, diag=diag, ) - provider = self.provider or "provider" - # Compose a brief "after Xs" suffix when we have timing data — helps - # the user distinguish "couldn't connect" (0s) from "died after 30s - # of streaming" (likely upstream idle-kill or proxy timeout). - _suffix = "" - if isinstance(diag, dict): - try: - started = diag.get("started_at") - if started is not None: - _suffix = f" after {max(0.0, time.time() - float(started)):.1f}s" - except Exception: - pass - try: - self._emit_status( - f"⚠️ {provider} stream {kind} ({type(error).__name__}){_suffix} " - f"— reconnecting, retry {attempt}/{max_attempts}" - ) - self._touch_activity( - f"stream retry {attempt}/{max_attempts} " - f"after {type(error).__name__}" - ) - except Exception: - pass def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None: """Surface a compact warning for failed auxiliary work.""" @@ -3271,201 +812,14 @@ class AIAgent: } def _check_compression_model_feasibility(self) -> None: - """Warn at session start if the auxiliary compression model's context - window is smaller than the main model's compression threshold. - - When the auxiliary model cannot fit the content that needs summarising, - compression will either fail outright (the LLM call errors) or produce - a severely truncated summary. - - Called during ``__init__`` so CLI users see the warning immediately - (via ``_vprint``). 
The gateway sets ``status_callback`` *after* - construction, so ``_replay_compression_warning()`` re-sends the - stored warning through the callback on the first - ``run_conversation()`` call. - """ - if not self.compression_enabled: - return - try: - from agent.auxiliary_client import ( - _resolve_task_provider_model, - get_text_auxiliary_client, - ) - from agent.model_metadata import ( - MINIMUM_CONTEXT_LENGTH, - get_model_context_length, - ) - - client, aux_model = get_text_auxiliary_client( - "compression", - main_runtime=self._current_main_runtime(), - ) - # Best-effort aux provider label for the warning message. The - # configured provider may be "auto", in which case we fall back - # to the client's base_url hostname so the user can still tell - # where the compression model is actually being called. - try: - _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression") - except Exception: - _aux_cfg_provider = "" - if client is None or not aux_model: - if _aux_cfg_provider and _aux_cfg_provider != "auto": - msg = ( - "⚠ Configured auxiliary compression provider " - f"'{_aux_cfg_provider}' is unavailable — context " - "compression will drop middle turns without a summary. " - "Check auxiliary.compression in config.yaml and " - "reauthenticate that provider." - ) - else: - msg = ( - "⚠ No auxiliary LLM provider configured — context " - "compression will drop middle turns without a summary. " - "Run `hermes setup` or set OPENROUTER_API_KEY." - ) - self._compression_warning = msg - self._emit_status(msg) - logger.warning( - "No auxiliary LLM provider for compression — " - "summaries will be unavailable." 
- ) - return - - aux_base_url = str(getattr(client, "base_url", "")) - aux_api_key = str(getattr(client, "api_key", "")) - - aux_context = get_model_context_length( - aux_model, - base_url=aux_base_url, - api_key=aux_api_key, - config_context_length=getattr(self, "_aux_compression_context_length_config", None), - # Each model must be resolved with its own provider so that - # provider-specific paths (e.g. Bedrock static table, OpenRouter API) - # are invoked for the correct client, not inherited from the main model. - provider=(_aux_cfg_provider if _aux_cfg_provider and _aux_cfg_provider != "auto" else getattr(self, "provider", "")), - custom_providers=self._custom_providers, - ) - - # Hard floor: the auxiliary compression model must have at least - # MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model - # is already required to meet this floor (checked earlier in - # __init__), so the compression model must too — otherwise it - # cannot summarise a full threshold-sized window of main-model - # content. Mirrors the main-model rejection pattern. - if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH: - raise ValueError( - f"Auxiliary compression model {aux_model} has a context " - f"window of {aux_context:,} tokens, which is below the " - f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes " - f"Agent. Choose a compression model with at least " - f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set " - f"auxiliary.compression.model in config.yaml), or set " - f"auxiliary.compression.context_length to override the " - f"detected value if it is wrong." - ) - - threshold = self.context_compressor.threshold_tokens - if aux_context < threshold: - # Auto-correct: lower the live session threshold so - # compression actually works this session. The hard floor - # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, - # so the new threshold is always >= 64K. 
- # - # The compression summariser sends a single user-role - # prompt (no system prompt, no tools) to the aux model, so - # new_threshold == aux_context is safe: the request is - # the raw messages plus a small summarisation instruction. - old_threshold = threshold - new_threshold = aux_context - self.context_compressor.threshold_tokens = new_threshold - # Keep threshold_percent in sync so future main-model - # context_length changes (update_model) re-derive from a - # sensible number rather than the original too-high value. - main_ctx = self.context_compressor.context_length - if main_ctx: - self.context_compressor.threshold_percent = ( - new_threshold / main_ctx - ) - safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50 - # Build human-readable "model (provider)" labels for both - # the main model and the compression model so users can - # tell at a glance which provider each side is actually - # using. When the configured provider is empty or "auto", - # fall back to the client's base_url hostname. - _main_model = getattr(self, "model", "") or "?" - _main_provider = getattr(self, "provider", "") or "" - _aux_provider_label = ( - _aux_cfg_provider - if _aux_cfg_provider and _aux_cfg_provider != "auto" - else "" - ) - if not _aux_provider_label: - try: - from urllib.parse import urlparse - _aux_provider_label = ( - urlparse(aux_base_url).hostname or aux_base_url - ) - except Exception: - _aux_provider_label = aux_base_url or "auto" - _main_label = ( - f"{_main_model} ({_main_provider})" - if _main_provider - else _main_model - ) - _aux_label = f"{aux_model} ({_aux_provider_label})" - msg = ( - f"⚠ Compression model {_aux_label} context is " - f"{aux_context:,} tokens, but the main model " - f"{_main_label}'s compression threshold was " - f"{old_threshold:,} tokens. " - f"Auto-lowered this session's threshold to " - f"{new_threshold:,} tokens so compression can run.\n" - f" To make this permanent, edit config.yaml — either:\n" - f" 1. 
Use a larger compression model:\n" - f" auxiliary:\n" - f" compression:\n" - f" model: \n" - f" 2. Lower the compression threshold:\n" - f" compression:\n" - f" threshold: 0.{safe_pct:02d}" - ) - self._compression_warning = msg - self._emit_status(msg) - logger.warning( - "Auxiliary compression model %s has %d token context, " - "below the main model's compression threshold of %d " - "tokens — auto-lowered session threshold to %d to " - "keep compression working.", - aux_model, - aux_context, - old_threshold, - new_threshold, - ) - except ValueError: - # Hard rejections (aux below minimum context) must propagate - # so the session refuses to start. - raise - except Exception as exc: - logger.debug( - "Compression feasibility check failed (non-fatal): %s", exc - ) + """Forwarder — see ``agent.conversation_compression.check_compression_model_feasibility``.""" + from agent.conversation_compression import check_compression_model_feasibility + check_compression_model_feasibility(self) def _replay_compression_warning(self) -> None: - """Re-send the compression warning through ``status_callback``. - - During ``__init__`` the gateway's ``status_callback`` is not yet - wired, so ``_emit_status`` only reaches ``_vprint`` (CLI). This - method is called once at the start of the first - ``run_conversation()`` — by then the gateway has set the callback, - so every platform (Telegram, Discord, Slack, etc.) receives the - warning. 
- """ - msg = getattr(self, "_compression_warning", None) - if msg and self.status_callback: - try: - self.status_callback("lifecycle", msg) - except Exception: - pass + """Forwarder — see ``agent.conversation_compression.replay_compression_warning``.""" + from agent.conversation_compression import replay_compression_warning + replay_compression_warning(self) def _is_direct_openai_url(self, base_url: str = None) -> bool: """Return True when a base URL targets OpenAI's native API.""" @@ -3573,101 +927,9 @@ class AIAgent: api_mode: Optional[str] = None, model: Optional[str] = None, ) -> tuple[bool, bool]: - """Decide whether to apply Anthropic prompt caching and which layout to use. - - Returns ``(should_cache, use_native_layout)``: - * ``should_cache`` — inject ``cache_control`` breakpoints for this - request (applies to OpenRouter Claude, native Anthropic, and - third-party gateways that speak the native Anthropic protocol). - * ``use_native_layout`` — place markers on the *inner* content - blocks (native Anthropic accepts and requires this layout); - when False markers go on the message envelope (OpenRouter and - OpenAI-wire proxies expect the looser layout). - - Third-party providers using the native Anthropic transport - (``api_mode == 'anthropic_messages'`` + Claude-named model) get - caching with the native layout so they benefit from the same - cost reduction as direct Anthropic callers, provided their - gateway implements the Anthropic cache_control contract - (MiniMax, Zhipu GLM, LiteLLM's Anthropic proxy mode all do). - - Qwen / Alibaba-family models on OpenCode, OpenCode Go, and direct - Alibaba (DashScope) also honour Anthropic-style ``cache_control`` - markers on OpenAI-wire chat completions. Upstream pi-mono #3392 / - pi #3393 documented this for opencode-go Qwen. Without markers - these providers serve zero cache hits, re-billing the full prompt - on every turn. 
- """ - eff_provider = (provider if provider is not None else self.provider) or "" - eff_base_url = base_url if base_url is not None else (self.base_url or "") - eff_api_mode = api_mode if api_mode is not None else (self.api_mode or "") - eff_model = (model if model is not None else self.model) or "" - - model_lower = eff_model.lower() - provider_lower = eff_provider.lower() - is_claude = "claude" in model_lower - is_openrouter = base_url_host_matches(eff_base_url, "openrouter.ai") - # Nous Portal proxies to OpenRouter behind the scenes — identical - # OpenAI-wire envelope cache_control semantics. Treat it as an - # OpenRouter-equivalent endpoint for caching layout purposes. - is_nous_portal = "nousresearch" in eff_base_url.lower() - is_anthropic_wire = eff_api_mode == "anthropic_messages" - is_native_anthropic = ( - is_anthropic_wire - and (eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com") - ) - - if is_native_anthropic: - return True, True - if (is_openrouter or is_nous_portal) and is_claude: - return True, False - # Nous Portal Qwen (e.g. qwen3.6-plus) takes the same envelope-layout - # cache_control path as Portal Claude. Portal proxies to OpenRouter - # and the upstream Qwen route accepts cache_control markers; without - # this branch the alibaba-family check below only matches - # provider=opencode/alibaba and Portal traffic falls through to - # (False, False), serving 0% cache hits and re-billing the full - # prompt on every turn. - if is_nous_portal and "qwen" in model_lower: - return True, False - if is_anthropic_wire and is_claude: - # Third-party Anthropic-compatible gateway. - return True, True - - # MiniMax on its Anthropic-compatible endpoint serves its own - # model family (MiniMax-M2.7, M2.5, M2.1, M2) with documented - # cache_control support (0.1× read pricing, 5-minute TTL). 
The - # blanket is_claude gate above excludes these — opt them in - # explicitly via provider id or host match so users on - # provider=minimax / minimax-cn (or custom endpoints pointing at - # api.minimax.io/anthropic / api.minimaxi.com/anthropic) get the - # same cost reduction as Claude traffic. - # Docs: https://platform.minimax.io/docs/api-reference/anthropic-api-compatible-cache - if is_anthropic_wire: - is_minimax_provider = provider_lower in {"minimax", "minimax-cn"} - is_minimax_host = ( - base_url_host_matches(eff_base_url, "api.minimax.io") - or base_url_host_matches(eff_base_url, "api.minimaxi.com") - ) - if is_minimax_provider or is_minimax_host: - return True, True - - # Qwen/Alibaba on OpenCode (Zen/Go) and native DashScope: OpenAI-wire - # transport that accepts Anthropic-style cache_control markers and - # rewards them with real cache hits. Without this branch - # qwen3.6-plus on opencode-go reports 0% cached tokens and burns - # through the subscription on every turn. - model_is_qwen = "qwen" in model_lower - provider_is_alibaba_family = provider_lower in { - "opencode", "opencode-zen", "opencode-go", "alibaba", - } - if provider_is_alibaba_family and model_is_qwen: - # Envelope layout (native_anthropic=False): markers on inner - # content parts, not top-level tool messages. Matches - # pi-mono's "alibaba" cacheControlFormat. - return True, False - - return False, False + """Forwarder — see ``agent.agent_runtime_helpers.anthropic_prompt_cache_policy``.""" + from agent.agent_runtime_helpers import anthropic_prompt_cache_policy + return anthropic_prompt_cache_policy(self, provider=provider, base_url=base_url, api_mode=api_mode, model=model) @staticmethod def _model_requires_responses_api(model: str) -> bool: @@ -3743,98 +1005,9 @@ class AIAgent: return bool(cleaned.strip()) def _strip_think_blocks(self, content: str) -> str: - """Remove reasoning/thinking blocks from content, returning only visible text. - - Handles four cases: - 1. 
Closed tag pairs (````) — the common path when - the provider emits complete reasoning blocks. - 2. Unterminated open tag at a block boundary (start of text or - after a newline) — e.g. MiniMax M2.7 / NIM endpoints where the - closing tag is dropped. Everything from the open tag to end - of string is stripped. The block-boundary check mirrors - ``gateway/stream_consumer.py``'s filter so models that mention - ```` in prose aren't over-stripped. - 3. Stray orphan open/close tags that slip through. - 4. Tag variants: ````, ````, ````, - ````, ```` (Gemma 4), all - case-insensitive. - - Additionally strips standalone tool-call XML blocks that some open - models (notably Gemma variants on OpenRouter) emit inside assistant - content instead of via the structured ``tool_calls`` field: - * ```` - * ```` - * ```` - * ```` - * ```` - * ```` (Gemma style) - Ported from openclaw/openclaw#67318. The ```` variant is - boundary-gated (only strips when the tag sits at start-of-line or - after punctuation and carries a ``name="..."`` attribute) so prose - mentions like "Use in JavaScript" are preserved. - """ - if not content: - return "" - # 1. Closed tag pairs — case-insensitive for all variants so - # mixed-case tags (, ) don't slip through to - # the unterminated-tag pass and take trailing content with them. - content = re.sub(r'.*?', '', content, flags=re.DOTALL | re.IGNORECASE) - content = re.sub(r'.*?', '', content, flags=re.DOTALL | re.IGNORECASE) - content = re.sub(r'.*?', '', content, flags=re.DOTALL | re.IGNORECASE) - content = re.sub(r'.*?', '', content, flags=re.DOTALL | re.IGNORECASE) - content = re.sub(r'.*?', '', content, flags=re.DOTALL | re.IGNORECASE) - # 1b. Tool-call XML blocks (openclaw/openclaw#67318). Handle the - # generic tag names first — they have no attribute gating since - # a literal in prose is already vanishingly rare. 
- for _tc_name in ("tool_call", "tool_calls", "tool_result", - "function_call", "function_calls"): - content = re.sub( - rf'<{_tc_name}\b[^>]*>.*?', - '', - content, - flags=re.DOTALL | re.IGNORECASE, - ) - # 1c. ... — Gemma-style standalone - # tool call. Only strip when the tag sits at a block boundary - # (start of text, after a newline, or after sentence-ending - # punctuation) AND carries a name="..." attribute. This keeps - # prose mentions like "Use to declare" safe. - content = re.sub( - r'(?:(?<=^)|(?<=[\n\r.!?:]))[ \t]*' - r']*\bname\s*=[^>]*>' - r'(?:(?:(?!).)*)', - '', - content, - flags=re.DOTALL | re.IGNORECASE, - ) - # 2. Unterminated reasoning block — open tag at a block boundary - # (start of text, or after a newline) with no matching close. - # Strip from the tag to end of string. Fixes #8878 / #9568 - # (MiniMax M2.7 leaking raw reasoning into assistant content). - content = re.sub( - r'(?:^|\n)[ \t]*<(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)\b[^>]*>.*$', - '', - content, - flags=re.DOTALL | re.IGNORECASE, - ) - # 3. Stray orphan open/close tags that slipped through. - content = re.sub( - r'\s*', - '', - content, - flags=re.IGNORECASE, - ) - # 3b. Stray tool-call closers. (We do NOT strip bare or - # unterminated because a truncated tail - # during streaming may still be valuable to the user; matches - # OpenClaw's intentional asymmetry.) 
- content = re.sub( - r'\s*', - '', - content, - flags=re.IGNORECASE, - ) - return content + """Forwarder — see ``agent.agent_runtime_helpers.strip_think_blocks``.""" + from agent.agent_runtime_helpers import strip_think_blocks + return strip_think_blocks(self, content) @staticmethod def _has_natural_response_ending(content: str) -> bool: @@ -3895,366 +1068,27 @@ class AIAgent: assistant_content: str, messages: List[Dict[str, Any]], ) -> bool: - """Detect a planning/ack message that should continue instead of ending the turn.""" - if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages): - return False - - assistant_text = self._strip_think_blocks(assistant_content or "").strip().lower() - if not assistant_text: - return False - if len(assistant_text) > 1200: - return False - - has_future_ack = bool( - re.search(r"\b(i['’]ll|i will|let me|i can do that|i can help with that)\b", assistant_text) - ) - if not has_future_ack: - return False - - action_markers = ( - "look into", - "look at", - "inspect", - "scan", - "check", - "analyz", - "review", - "explore", - "read", - "open", - "run", - "test", - "fix", - "debug", - "search", - "find", - "walkthrough", - "report back", - "summarize", - ) - workspace_markers = ( - "directory", - "current directory", - "current dir", - "cwd", - "repo", - "repository", - "codebase", - "project", - "folder", - "filesystem", - "file tree", - "files", - "path", - ) - - user_text = (user_message or "").strip().lower() - user_targets_workspace = ( - any(marker in user_text for marker in workspace_markers) - or "~/" in user_text - or "/" in user_text - ) - assistant_mentions_action = any(marker in assistant_text for marker in action_markers) - assistant_targets_workspace = any( - marker in assistant_text for marker in workspace_markers - ) - return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action - + """Forwarder — see 
``agent.agent_runtime_helpers.looks_like_codex_intermediate_ack``.""" + from agent.agent_runtime_helpers import looks_like_codex_intermediate_ack + return looks_like_codex_intermediate_ack(self, user_message, assistant_content, messages) def _extract_reasoning(self, assistant_message) -> Optional[str]: - """ - Extract reasoning/thinking content from an assistant message. - - OpenRouter and various providers can return reasoning in multiple formats: - 1. message.reasoning - Direct reasoning field (DeepSeek, Qwen, etc.) - 2. message.reasoning_content - Alternative field (Moonshot AI, Novita, etc.) - 3. message.reasoning_details - Array of {type, summary, ...} objects (OpenRouter unified) - - Args: - assistant_message: The assistant message object from the API response - - Returns: - Combined reasoning text, or None if no reasoning found - """ - reasoning_parts = [] - - # Check direct reasoning field - if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning: - reasoning_parts.append(assistant_message.reasoning) - - # Check reasoning_content field (alternative name used by some providers) - if hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content: - # Don't duplicate if same as reasoning - if assistant_message.reasoning_content not in reasoning_parts: - reasoning_parts.append(assistant_message.reasoning_content) - - # Check reasoning_details array (OpenRouter unified format) - # Format: [{"type": "reasoning.summary", "summary": "...", ...}, ...] 
- if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details: - for detail in assistant_message.reasoning_details: - if isinstance(detail, dict): - # Extract summary from reasoning detail object - summary = ( - detail.get('summary') - or detail.get('thinking') - or detail.get('content') - or detail.get('text') - ) - if summary and summary not in reasoning_parts: - reasoning_parts.append(summary) - - # Some providers embed reasoning directly inside assistant content - # instead of returning structured reasoning fields. Only fall back - # to inline extraction when no structured reasoning was found. - content = getattr(assistant_message, "content", None) - if not reasoning_parts and isinstance(content, list): - # DeepSeek V4 Pro (and compatible providers) return content as a - # list of typed blocks, e.g.: - # [{"type": "thinking", "thinking": "..."}, {"type": "output", ...}] - # Without this branch the thinking text is silently dropped and the - # next turn fails with HTTP 400 ("thinking must be passed back"). - # Refs #21944. 
- for block in content: - if isinstance(block, dict) and block.get("type") == "thinking": - thinking_text = block.get("thinking") or block.get("text") or "" - thinking_text = thinking_text.strip() - if thinking_text and thinking_text not in reasoning_parts: - reasoning_parts.append(thinking_text) - if not reasoning_parts and isinstance(content, str) and content: - inline_patterns = ( - r"(.*?)", - r"(.*?)", - r"(.*?)", - r"(.*?)", - r"(.*?)", - ) - for pattern in inline_patterns: - flags = re.DOTALL | re.IGNORECASE - for block in re.findall(pattern, content, flags=flags): - cleaned = block.strip() - if cleaned and cleaned not in reasoning_parts: - reasoning_parts.append(cleaned) - - # Combine all reasoning parts - if reasoning_parts: - return "\n\n".join(reasoning_parts) - - return None + """Forwarder — see ``agent.agent_runtime_helpers.extract_reasoning``.""" + from agent.agent_runtime_helpers import extract_reasoning + return extract_reasoning(self, assistant_message) def _cleanup_task_resources(self, task_id: str) -> None: - """Clean up VM and browser resources for a given task. - - Skips ``cleanup_vm`` when the active terminal environment is marked - persistent (``persistent_filesystem=True``) so that long-lived sandbox - containers survive between turns. The idle reaper in - ``terminal_tool._cleanup_inactive_envs`` still tears them down once - ``terminal.lifetime_seconds`` is exceeded. Non-persistent backends are - torn down per-turn as before to prevent resource leakage (the original - intent of this hook for the Morph backend, see commit fbd3a2fd). - """ - try: - if is_persistent_env(task_id): - if self.verbose_logging: - logging.debug( - f"Skipping per-turn cleanup_vm for persistent env {task_id}; " - f"idle reaper will handle it." 
- ) - else: - cleanup_vm(task_id) - except Exception as e: - if self.verbose_logging: - logging.warning(f"Failed to cleanup VM for task {task_id}: {e}") - try: - cleanup_browser(task_id) - except Exception as e: - if self.verbose_logging: - logging.warning(f"Failed to cleanup browser for task {task_id}: {e}") + """Forwarder — see ``agent.chat_completion_helpers.cleanup_task_resources``.""" + from agent.chat_completion_helpers import cleanup_task_resources + return cleanup_task_resources(self, task_id) # ------------------------------------------------------------------ - # Background memory/skill review + # Background memory/skill review — prompts live in agent.background_review # ------------------------------------------------------------------ - - _MEMORY_REVIEW_PROMPT = ( - "Review the conversation above and consider saving to memory if appropriate.\n\n" - "Focus on:\n" - "1. Has the user revealed things about themselves — their persona, desires, " - "preferences, or personal details worth remembering?\n" - "2. Has the user expressed expectations about how you should behave, their work " - "style, or ways they want you to operate?\n\n" - "If something stands out, save it using the memory tool. " - "If nothing is worth saving, just say 'Nothing to save.' and stop." - ) - - _SKILL_REVIEW_PROMPT = ( - "Review the conversation above and update the skill library. Be " - "ACTIVE — most sessions produce at least one skill update, even if " - "small. A pass that does nothing is a missed learning opportunity, " - "not a neutral outcome.\n\n" - "Target shape of the library: CLASS-LEVEL skills, each with a rich " - "SKILL.md and a `references/` directory for session-specific detail. " - "Not a long flat list of narrow one-session-one-skill entries. This " - "shapes HOW you update, not WHETHER you update.\n\n" - "Signals to look for (any one of these warrants action):\n" - " • User corrected your style, tone, format, legibility, or " - "verbosity. 
Frustration signals like 'stop doing X', 'this is too " - "verbose', 'don't format like this', 'why are you explaining', " - "'just give me the answer', 'you always do Y and I hate it', or an " - "explicit 'remember this' are FIRST-CLASS skill signals, not just " - "memory signals. Update the relevant skill(s) to embed the " - "preference so the next session starts already knowing.\n" - " • User corrected your workflow, approach, or sequence of steps. " - "Encode the correction as a pitfall or explicit step in the skill " - "that governs that class of task.\n" - " • Non-trivial technique, fix, workaround, debugging path, or " - "tool-usage pattern emerged that a future session would benefit " - "from. Capture it.\n" - " • A skill that got loaded or consulted this session turned out " - "to be wrong, missing a step, or outdated. Patch it NOW.\n\n" - "Preference order — prefer the earliest action that fits, but do " - "pick one when a signal above fired:\n" - " 1. UPDATE A CURRENTLY-LOADED SKILL. Look back through the " - "conversation for skills the user loaded via /skill-name or you " - "read via skill_view. If any of them covers the territory of the " - "new learning, PATCH that one first. It is the skill that was in " - "play, so it's the right one to extend.\n" - " 2. UPDATE AN EXISTING UMBRELLA (via skills_list + skill_view). " - "If no loaded skill fits but an existing class-level skill does, " - "patch it. Add a subsection, a pitfall, or broaden a trigger.\n" - " 3. ADD A SUPPORT FILE under an existing umbrella. Skills can be " - "packaged with three kinds of support files — use the right " - "directory per kind:\n" - " • `references/.md` — session-specific detail (error " - "transcripts, reproduction recipes, provider quirks) AND " - "condensed knowledge banks: quoted research, API docs, external " - "authoritative excerpts, or domain notes you found while working " - "on the problem. 
Write it concise and for the value of the task, " - "not as a full mirror of upstream docs.\n" - " • `templates/.` — starter files meant to be " - "copied and modified (boilerplate configs, scaffolding, a " - "known-good example the agent can `reproduce with modifications`).\n" - " • `scripts/.` — statically re-runnable actions " - "the skill can invoke directly (verification scripts, fixture " - "generators, deterministic probes, anything the agent should run " - "rather than hand-type each time).\n" - " Add support files via skill_manage action=write_file with " - "file_path starting 'references/', 'templates/', or 'scripts/'. " - "The umbrella's SKILL.md should gain a one-line pointer to any " - "new support file so future agents know it exists.\n" - " 4. CREATE A NEW CLASS-LEVEL UMBRELLA SKILL when no existing " - "skill covers the class. The name MUST be at the class level. " - "The name MUST NOT be a specific PR number, error string, feature " - "codename, library-alone name, or 'fix-X / debug-Y / audit-Z-today' " - "session artifact. If the proposed name only makes sense for " - "today's task, it's wrong — fall back to (1), (2), or (3).\n\n" - "User-preference embedding (important): when the user expressed a " - "style/format/workflow preference, the update belongs in the " - "SKILL.md body, not just in memory. Memory captures 'who the user " - "is and what the current situation and state of your operations " - "are'; skills capture 'how to do this class of task for this " - "user'. 
When they complain about how you handled a task, the " - "skill that governs that task needs to carry the lesson.\n\n" - "If you notice two existing skills that overlap, note it in your " - "reply — the background curator handles consolidation at scale.\n\n" - "Do NOT capture (these become persistent self-imposed constraints " - "that bite you later when the environment changes):\n" - " • Environment-dependent failures: missing binaries, fresh-install " - "errors, post-migration path mismatches, 'command not found', " - "unconfigured credentials, uninstalled packages. The user can fix " - "these — they are not durable rules.\n" - " • Negative claims about tools or features ('browser tools do not " - "work', 'X tool is broken', 'cannot use Y from execute_code'). These " - "harden into refusals the agent cites against itself for months " - "after the actual problem was fixed.\n" - " • Session-specific transient errors that resolved before the " - "conversation ended. If retrying worked, the lesson is the retry " - "pattern, not the original failure.\n" - " • One-off task narratives. A user asking 'summarize today's " - "market' or 'analyze this PR' is not a class of work that warrants " - "a skill.\n\n" - "If a tool failed because of setup state, capture the FIX (install " - "command, config step, env var to set) under an existing setup or " - "troubleshooting skill — never 'this tool does not work' as a " - "standalone constraint.\n\n" - "'Nothing to save.' is a real option but should NOT be the " - "default. If the session ran smoothly with no corrections and " - "produced no new technique, just say 'Nothing to save.' and stop. " - "Otherwise, act." - ) - - _COMBINED_REVIEW_PROMPT = ( - "Review the conversation above and update two things:\n\n" - "**Memory**: who the user is. Did the user reveal persona, " - "desires, preferences, personal details, or expectations about " - "how you should behave? 
Save facts about the user and durable " - "preferences with the memory tool.\n\n" - "**Skills**: how to do this class of task. Be ACTIVE — most " - "sessions produce at least one skill update. A pass that does " - "nothing is a missed learning opportunity, not a neutral outcome.\n\n" - "Target shape of the skill library: CLASS-LEVEL skills with a rich " - "SKILL.md and a `references/` directory for session-specific detail. " - "Not a long flat list of narrow one-session-one-skill entries.\n\n" - "Signals that warrant a skill update (any one is enough):\n" - " • User corrected your style, tone, format, legibility, " - "verbosity, or approach. Frustration is a FIRST-CLASS skill " - "signal, not just a memory signal. 'stop doing X', 'don't format " - "like this', 'I hate when you Y' — embed the lesson in the skill " - "that governs that task so the next session starts fixed.\n" - " • Non-trivial technique, fix, workaround, or debugging path " - "emerged.\n" - " • A skill that was loaded or consulted turned out wrong, " - "missing, or outdated — patch it now.\n\n" - "Preference order for skills — pick the earliest that fits:\n" - " 1. UPDATE A CURRENTLY-LOADED SKILL. Check what skills were " - "loaded via /skill-name or skill_view in the conversation. If one " - "of them covers the learning, PATCH it first. It was in play; " - "it's the right place.\n" - " 2. UPDATE AN EXISTING UMBRELLA (skills_list + skill_view to " - "find the right one). Patch it.\n" - " 3. ADD A SUPPORT FILE under an existing umbrella via " - "skill_manage action=write_file. Three kinds: " - "`references/.md` for session-specific detail OR condensed " - "knowledge banks (quoted research, API docs excerpts, domain " - "notes) written concise and task-focused; `templates/.` " - "for starter files meant to be copied and modified; " - "`scripts/.` for statically re-runnable actions " - "(verification, fixture generators, probes). 
Add a one-line " - "pointer in SKILL.md so future agents find them.\n" - " 4. CREATE A NEW CLASS-LEVEL UMBRELLA when nothing exists. " - "Name at the class level — NOT a PR number, error string, " - "codename, library-alone name, or 'fix-X / debug-Y' session " - "artifact. If the name only fits today's task, fall back to (1), " - "(2), or (3).\n\n" - "User-preference embedding: when the user complains about how " - "you handled a task, update the skill that governs that task — " - "memory alone isn't enough. Memory says 'who the user is and " - "what the current situation and state of your operations are'; " - "skills say 'how to do this class of task for this user'. Both " - "should carry user-preference lessons when relevant.\n\n" - "If you notice overlapping existing skills, mention it — the " - "background curator handles consolidation.\n\n" - "Do NOT capture as skills (these become persistent self-imposed " - "constraints that bite you later when the environment changes):\n" - " • Environment-dependent failures: missing binaries, fresh-install " - "errors, post-migration path mismatches, 'command not found', " - "unconfigured credentials, uninstalled packages. The user can fix " - "these — they are not durable rules.\n" - " • Negative claims about tools or features ('browser tools do not " - "work', 'X tool is broken', 'cannot use Y from execute_code'). These " - "harden into refusals the agent cites against itself for months " - "after the actual problem was fixed.\n" - " • Session-specific transient errors that resolved before the " - "conversation ended. If retrying worked, the lesson is the retry " - "pattern, not the original failure.\n" - " • One-off task narratives. 
A user asking 'summarize today's " - "market' or 'analyze this PR' is not a class of work that warrants " - "a skill.\n\n" - "If a tool failed because of setup state, capture the FIX (install " - "command, config step, env var to set) under an existing setup or " - "troubleshooting skill — never 'this tool does not work' as a " - "standalone constraint.\n\n" - "Act on whichever of the two dimensions has real signal. If " - "genuinely nothing stands out on either, say 'Nothing to save.' " - "and stop — but don't reach for that conclusion as a default." + from agent.background_review import ( + _MEMORY_REVIEW_PROMPT, + _SKILL_REVIEW_PROMPT, + _COMBINED_REVIEW_PROMPT, ) @staticmethod @@ -4262,63 +1096,9 @@ class AIAgent: review_messages: List[Dict], prior_snapshot: List[Dict], ) -> List[str]: - """Build the human-facing action summary for a background review pass. - - Walks the review agent's session messages and collects "successful tool - action" descriptions to surface to the user (e.g. "Memory updated"). - Tool messages already present in ``prior_snapshot`` are skipped so we - don't re-surface stale results from the prior conversation that the - review agent inherited via ``conversation_history`` (issue #14944). - - Matching is by ``tool_call_id`` when available, with a content-equality - fallback for tool messages that lack one. 
- """ - existing_tool_call_ids = set() - existing_tool_contents = set() - for prior in prior_snapshot or []: - if not isinstance(prior, dict) or prior.get("role") != "tool": - continue - tcid = prior.get("tool_call_id") - if tcid: - existing_tool_call_ids.add(tcid) - else: - content = prior.get("content") - if isinstance(content, str): - existing_tool_contents.add(content) - - actions: List[str] = [] - for msg in review_messages or []: - if not isinstance(msg, dict) or msg.get("role") != "tool": - continue - tcid = msg.get("tool_call_id") - if tcid and tcid in existing_tool_call_ids: - continue - if not tcid: - content_str = msg.get("content") - if isinstance(content_str, str) and content_str in existing_tool_contents: - continue - try: - data = json.loads(msg.get("content", "{}")) - except (json.JSONDecodeError, TypeError): - continue - if not isinstance(data, dict) or not data.get("success"): - continue - message = data.get("message", "") - target = data.get("target", "") - if "created" in message.lower(): - actions.append(message) - elif "updated" in message.lower(): - actions.append(message) - elif "added" in message.lower() or (target and "add" in message.lower()): - label = "Memory" if target == "memory" else "User profile" if target == "user" else target - actions.append(f"{label} updated") - elif "Entry added" in message: - label = "Memory" if target == "memory" else "User profile" if target == "user" else target - actions.append(f"{label} updated") - elif "removed" in message.lower() or "replaced" in message.lower(): - label = "Memory" if target == "memory" else "User profile" if target == "user" else target - actions.append(f"{label} updated") - return actions + """Forwarder — see ``agent.background_review.summarize_background_review_actions``.""" + from agent.background_review import summarize_background_review_actions + return summarize_background_review_actions(review_messages, prior_snapshot) def _spawn_background_review( self, @@ -4326,235 +1106,22 
@@ class AIAgent: review_memory: bool = False, review_skills: bool = False, ) -> None: - """Spawn a background thread to review the conversation for memory/skill saves. + """Spawn the background memory/skill review thread. - Creates a full AIAgent fork with the same model, tools, and context as the - main session. The review prompt is appended as the next user turn in the - forked conversation. Writes directly to the shared memory/skill stores. - Never modifies the main conversation history or produces user-visible output. + Thin wrapper — the heavy lifting lives in + ``agent.background_review.spawn_background_review_thread`` which + returns the thread target. ``threading.Thread`` is constructed + here so existing tests that patch ``run_agent.threading.Thread`` + keep working. """ - import threading - - # Pick the right prompt based on which triggers fired - if review_memory and review_skills: - prompt = self._COMBINED_REVIEW_PROMPT - elif review_memory: - prompt = self._MEMORY_REVIEW_PROMPT - else: - prompt = self._SKILL_REVIEW_PROMPT - - def _run_review(): - import contextlib - # Install a non-interactive approval callback on this worker - # thread so any dangerous-command guard the review agent trips - # resolves to "deny" instead of falling back to input() -- which - # deadlocks against the parent's prompt_toolkit TUI (#15216). - # Same pattern as _subagent_auto_deny in tools/delegate_tool.py. 
- def _bg_review_auto_deny(command, description, **kwargs): - logger.warning( - "Background review auto-denied dangerous command: %s (%s)", - command, description, - ) - return "deny" - try: - _set_approval_callback(_bg_review_auto_deny) - except Exception: - pass - review_agent = None - review_messages = [] - try: - with open(os.devnull, "w", encoding="utf-8") as _devnull, \ - contextlib.redirect_stdout(_devnull), \ - contextlib.redirect_stderr(_devnull): - # Inherit the parent agent's live runtime (provider, model, - # base_url, api_key, api_mode) so the fork uses the exact - # same credentials the main turn is using. Without this, - # AIAgent.__init__ re-runs auto-resolution from env vars, - # which fails for OAuth-only providers, session-scoped - # creds, or credential-pool setups where the resolver can't - # reconstruct auth from scratch -- producing the spurious - # "No LLM provider configured" warning at end of turn. - _parent_runtime = self._current_main_runtime() - _parent_api_mode = _parent_runtime.get("api_mode") or None - # The review fork needs to call agent-loop tools (memory, - # skill_manage). Those tools require Hermes' own dispatch, - # which the codex_app_server runtime bypasses entirely - # (it runs the turn inside codex's subprocess). So when - # the parent is on codex_app_server, downgrade the review - # fork to codex_responses — same auth/credentials, but - # talks to the OpenAI Responses API directly so Hermes - # owns the loop and the agent-loop tools dispatch. - if _parent_api_mode == "codex_app_server": - _parent_api_mode = "codex_responses" - # skip_memory=True keeps the review fork from - # touching external memory plugins (honcho, mem0, - # supermemory, etc.). 
Without it, the fork's - # __init__ rebuilds its own _memory_manager from - # config, scoped to the parent's session_id, and - # run_conversation() then leaks the harness prompt - # into the user's real memory namespace via three - # ingestion sites: on_turn_start (cadence + turn - # message), prefetch_all (recall query), and - # sync_all (harness prompt + review output recorded - # as a (user, assistant) turn pair). Built-in - # MEMORY.md / USER.md state is re-bound from the - # parent below so memory(action="add") writes from - # the review still land on disk; the review just - # has zero side effects on external providers. - review_agent = AIAgent( - model=self.model, - max_iterations=16, - quiet_mode=True, - platform=self.platform, - provider=self.provider, - api_mode=_parent_api_mode, - base_url=_parent_runtime.get("base_url") or None, - api_key=_parent_runtime.get("api_key") or None, - credential_pool=getattr(self, "_credential_pool", None), - parent_session_id=self.session_id, - skip_memory=True, - ) - review_agent._memory_write_origin = "background_review" - review_agent._memory_write_context = "background_review" - review_agent._memory_store = self._memory_store - review_agent._memory_enabled = self._memory_enabled - review_agent._user_profile_enabled = self._user_profile_enabled - review_agent._memory_nudge_interval = 0 - review_agent._skill_nudge_interval = 0 - # Suppress all status/warning emits from the fork so the - # user only sees the final successful-action summary. - # Without this, mid-review "Iteration budget exhausted", - # rate-limit retries, compression warnings, and other - # lifecycle messages bubble up through _emit_status -> - # _vprint and leak past the stdout redirect (they go via - # _print_fn/status_callback, which bypass sys.stdout). 
- review_agent.suppress_status_output = True - # Inherit the parent's cached system prompt verbatim so - # the review fork's outbound HTTP request hits the same - # Anthropic/OpenRouter prefix cache the parent warmed. - # Without this, the fork rebuilds the system prompt from - # scratch (fresh _hermes_now() timestamp, fresh - # session_id, narrower toolset → different skills_prompt) - # and the byte-exact prefix-cache key misses. See - # issue #25322 and PR #17276 for the full analysis + - # measured impact (~26% end-to-end cost reduction on - # Sonnet 4.5). - review_agent._cached_system_prompt = self._cached_system_prompt - # Defensive: pin session_start + session_id to the - # parent's so any code path that re-renders parts of - # the system prompt (compression, plugin hooks) still - # produces byte-identical output. The cached-prompt - # assignment above already short-circuits the normal - # rebuild path, but these pins guarantee parity even - # if a future code path bypasses the cache. - review_agent.session_start = self.session_start - review_agent.session_id = self.session_id - - from model_tools import get_tool_definitions - from hermes_cli.plugins import ( - set_thread_tool_whitelist, - clear_thread_tool_whitelist, - ) - - review_whitelist = { - t["function"]["name"] - for t in get_tool_definitions( - enabled_toolsets=["memory", "skills"], - quiet_mode=True, - ) - } - set_thread_tool_whitelist( - review_whitelist, - deny_msg_fmt=( - "Background review denied non-whitelisted tool: " - "{tool_name}. Only memory/skill tools are allowed." - ), - ) - try: - review_agent.run_conversation( - user_message=( - prompt - + "\n\nYou can only call memory and skill " - "management tools. Other tools will be denied " - "at runtime — do not attempt them." 
- ), - conversation_history=messages_snapshot, - ) - finally: - clear_thread_tool_whitelist() - - # Tear down memory providers while stdout is still - # redirected so background thread teardown (Honcho flush, - # Hindsight sync, etc.) stays silent. The finally block - # below is a safety net for the exception path. - try: - review_agent.shutdown_memory_provider() - except Exception: - pass - try: - review_agent.close() - except Exception: - pass - review_messages = list(getattr(review_agent, "_session_messages", [])) - review_agent = None - - # Scan the review agent's messages for successful tool actions - # and surface a compact summary to the user. Tool messages - # already present in messages_snapshot must be skipped, since - # the review agent inherits that history and would otherwise - # re-surface stale "created"/"updated" messages from the prior - # conversation as if they just happened (issue #14944). - actions = self._summarize_background_review_actions( - review_messages, - messages_snapshot, - ) - - if actions: - summary = " · ".join(dict.fromkeys(actions)) - self._safe_print( - f" 💾 Self-improvement review: {summary}" - ) - _bg_cb = self.background_review_callback - if _bg_cb: - try: - _bg_cb( - f"💾 Self-improvement review: {summary}" - ) - except Exception: - pass - - except Exception as e: - logger.warning("Background memory/skill review failed: %s", e) - self._emit_auxiliary_failure("background review", e) - finally: - # Safety-net cleanup for the exception path. Normal - # completion already shut down inside redirect_stdout above. - # Re-open devnull here so any teardown output (Honcho flush, - # Hindsight sync, background thread joins) stays silent even - # on the exception path where redirect_stdout already exited. 
- if review_agent is not None: - try: - with open(os.devnull, "w", encoding="utf-8") as _fn, \ - contextlib.redirect_stdout(_fn), \ - contextlib.redirect_stderr(_fn): - try: - review_agent.shutdown_memory_provider() - except Exception: - pass - try: - review_agent.close() - except Exception: - pass - except Exception: - pass - # Clear the approval callback on this bg-review thread so a - # recycled thread-id doesn't inherit a stale reference. - try: - _set_approval_callback(None) - except Exception: - pass - - t = threading.Thread(target=_run_review, daemon=True, name="bg-review") + from agent.background_review import spawn_background_review_thread + target, _prompt = spawn_background_review_thread( + self, + messages_snapshot, + review_memory=review_memory, + review_skills=review_skills, + ) + t = threading.Thread(target=target, daemon=True, name="bg-review") t.start() def _build_memory_write_metadata( @@ -4565,23 +1132,15 @@ class AIAgent: task_id: Optional[str] = None, tool_call_id: Optional[str] = None, ) -> Dict[str, Any]: - """Build provenance metadata for external memory-provider mirrors.""" - metadata: Dict[str, Any] = { - "write_origin": write_origin or getattr(self, "_memory_write_origin", "assistant_tool"), - "execution_context": ( - execution_context - or getattr(self, "_memory_write_context", "foreground") - ), - "session_id": self.session_id or "", - "parent_session_id": self._parent_session_id or "", - "platform": self.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"), - "tool_name": "memory", - } - if task_id: - metadata["task_id"] = task_id - if tool_call_id: - metadata["tool_call_id"] = tool_call_id - return {k: v for k, v in metadata.items() if v not in {None, ""}} + """Forwarder — see ``agent.background_review.build_memory_write_metadata``.""" + from agent.background_review import build_memory_write_metadata + return build_memory_write_metadata( + self, + write_origin=write_origin, + execution_context=execution_context, + 
task_id=task_id, + tool_call_id=tool_call_id, + ) def _apply_persist_user_message_override(self, messages: List[Dict]) -> None: """Rewrite the current-turn user message before persistence/return. @@ -4666,104 +1225,9 @@ class AIAgent: messages.pop() def _repair_message_sequence(self, messages: List[Dict]) -> int: - """Collapse malformed role-alternation left in the live history. - - Providers (OpenAI, OpenRouter, Anthropic) expect strict alternation: - after the system message, user/tool alternates with assistant, with - no two consecutive user messages and no tool-result that doesn't - follow an assistant-with-tool_calls. Violations cause silent empty - responses on most providers, which triggers the empty-retry loop. - - This runs right before the API call as a defensive belt — by the - time it fires, the scaffolding strip should already have prevented - most shapes, but external callers (gateway multi-queue replay, - session resume, cron, explicit conversation_history passed in by - host code) can feed in already-broken histories. - - Repairs applied: - 1. Stray ``tool`` messages whose ``tool_call_id`` doesn't match - any preceding assistant tool_call — dropped. - 2. Consecutive ``user`` messages — merged with newline separator - so no user input is lost. - - Deliberately does NOT rewind orphan ``assistant(tool_calls)+tool`` - pairs that precede a user message — that pattern IS valid when the - previous turn completed normally and the user jumped in to redirect - before the model got a continuation turn (the ongoing dialog - pattern). The empty-response scaffolding stripper handles the - genuinely-broken variant via its flag-gated rewind. - - Returns the number of repairs made (for logging/telemetry). - """ - if not messages: - return 0 - - repairs = 0 - - # Pass 1: drop stray tool messages that don't follow a known - # assistant tool_call_id. Uses a rolling set of known ids refreshed - # on each assistant message. 
- known_tool_ids: set = set() - filtered: List[Dict] = [] - for msg in messages: - if not isinstance(msg, dict): - filtered.append(msg) - continue - role = msg.get("role") - if role == "assistant": - known_tool_ids = set() - for tc in (msg.get("tool_calls") or []): - tc_id = tc.get("id") if isinstance(tc, dict) else None - if tc_id: - known_tool_ids.add(tc_id) - filtered.append(msg) - elif role == "tool": - tc_id = msg.get("tool_call_id") - if tc_id and tc_id in known_tool_ids: - filtered.append(msg) - else: - repairs += 1 - else: - if role == "user": - # A user turn closes the tool-result run; subsequent - # tool messages without a fresh assistant tool_call - # are orphans. - known_tool_ids = set() - filtered.append(msg) - - # Pass 2: merge consecutive user messages. Preserves all user input - # so nothing the user typed is lost. - merged: List[Dict] = [] - for msg in filtered: - if ( - merged - and isinstance(msg, dict) - and msg.get("role") == "user" - and isinstance(merged[-1], dict) - and merged[-1].get("role") == "user" - ): - prev = merged[-1] - prev_content = prev.get("content", "") - new_content = msg.get("content", "") - # Only merge plain-text content; leave multimodal (list) - # content alone — collapsing image/audio blocks risks - # mangling the attachment structure. - if isinstance(prev_content, str) and isinstance(new_content, str): - prev["content"] = ( - (prev_content + "\n\n" + new_content) - if prev_content and new_content - else (prev_content or new_content) - ) - repairs += 1 - continue - merged.append(msg) - - if repairs > 0: - # Rewrite in place so downstream paths (persistence, return - # value, session DB flush) see the repaired sequence. 
- messages[:] = merged - - return repairs + """Forwarder — see ``agent.agent_runtime_helpers.repair_message_sequence``.""" + from agent.agent_runtime_helpers import repair_message_sequence + return repair_message_sequence(self, messages) def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None): """Persist any un-flushed messages to the SQLite session store. @@ -4856,197 +1320,14 @@ class AIAgent: return messages[:last_assistant_idx] def _format_tools_for_system_message(self) -> str: - """ - Format tool definitions for the system message in the trajectory format. - - Returns: - str: JSON string representation of tool definitions - """ - if not self.tools: - return "[]" - - # Convert tool definitions to the format expected in trajectories - formatted_tools = [] - for tool in self.tools: - func = tool["function"] - formatted_tool = { - "name": func["name"], - "description": func.get("description", ""), - "parameters": func.get("parameters", {}), - "required": None # Match the format in the example - } - formatted_tools.append(formatted_tool) - - return json.dumps(formatted_tools, ensure_ascii=False) + """Forwarder — see ``agent.system_prompt.format_tools_for_system_message``.""" + from agent.system_prompt import format_tools_for_system_message + return format_tools_for_system_message(self) def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]: - """ - Convert internal message format to trajectory format for saving. - - Args: - messages (List[Dict]): Internal message history - user_query (str): Original user query - completed (bool): Whether the conversation completed successfully - - Returns: - List[Dict]: Messages in trajectory format - """ - # Normalize multimodal tool results — trajectories are text-only, so - # replace image-bearing tool messages with their text_summary to avoid - # embedding ~1MB base64 blobs into every saved trajectory. 
- messages = [_trajectory_normalize_msg(m) for m in messages] - trajectory = [] - - # Add system message with tool definitions - system_msg = ( - "You are a function calling AI model. You are provided with function signatures within XML tags. " - "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting " - "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug " - "into functions. After calling & executing the functions, you will be provided with function results within " - " XML tags. Here are the available tools:\n" - f"\n{self._format_tools_for_system_message()}\n\n" - "For each function call return a JSON object, with the following pydantic model json schema for each:\n" - "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, " - "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n" - "Each function call should be enclosed within XML tags.\n" - "Example:\n\n{'name': ,'arguments': }\n" - ) - - trajectory.append({ - "from": "system", - "value": system_msg - }) - - # Add the actual user prompt (from the dataset) as the first human message - trajectory.append({ - "from": "human", - "value": user_query - }) - - # Skip the first message (the user query) since we already added it above. - # Prefill messages are injected at API-call time only (not in the messages - # list), so no offset adjustment is needed here. 
- i = 1 - - while i < len(messages): - msg = messages[i] - - if msg["role"] == "assistant": - # Check if this message has tool calls - if "tool_calls" in msg and msg["tool_calls"]: - # Format assistant message with tool calls - # Add tags around reasoning for trajectory storage - content = "" - - # Prepend reasoning in tags if available (native thinking tokens) - if msg.get("reasoning") and msg["reasoning"].strip(): - content = f"\n{msg['reasoning']}\n\n" - - if msg.get("content") and msg["content"].strip(): - # Convert any tags to tags - # (used when native thinking is disabled and model reasons via XML) - content += convert_scratchpad_to_think(msg["content"]) + "\n" - - # Add tool calls wrapped in XML tags - for tool_call in msg["tool_calls"]: - if not tool_call or not isinstance(tool_call, dict): continue - # Parse arguments - should always succeed since we validate during conversation - # but keep try-except as safety net - try: - arguments = json.loads(tool_call["function"]["arguments"]) if isinstance(tool_call["function"]["arguments"], str) else tool_call["function"]["arguments"] - except json.JSONDecodeError: - # This shouldn't happen since we validate and retry during conversation, - # but if it does, log warning and use empty dict - logging.warning(f"Unexpected invalid JSON in trajectory conversion: {tool_call['function']['arguments'][:100]}") - arguments = {} - - tool_call_json = { - "name": tool_call["function"]["name"], - "arguments": arguments - } - content += f"\n{json.dumps(tool_call_json, ensure_ascii=False)}\n\n" - - # Ensure every gpt turn has a block (empty if no reasoning) - # so the format is consistent for training data - if "" not in content: - content = "\n\n" + content - - trajectory.append({ - "from": "gpt", - "value": content.rstrip() - }) - - # Collect all subsequent tool responses - tool_responses = [] - j = i + 1 - while j < len(messages) and messages[j]["role"] == "tool": - tool_msg = messages[j] - # Format tool response with XML tags 
- tool_response = "\n" - - # Try to parse tool content as JSON if it looks like JSON - tool_content = tool_msg["content"] - try: - if tool_content.strip().startswith(("{", "[")): - tool_content = json.loads(tool_content) - except (json.JSONDecodeError, AttributeError): - pass # Keep as string if not valid JSON - - tool_index = len(tool_responses) - tool_name = ( - msg["tool_calls"][tool_index]["function"]["name"] - if tool_index < len(msg["tool_calls"]) - else "unknown" - ) - tool_response += json.dumps({ - "tool_call_id": tool_msg.get("tool_call_id", ""), - "name": tool_name, - "content": tool_content - }, ensure_ascii=False) - tool_response += "\n" - tool_responses.append(tool_response) - j += 1 - - # Add all tool responses as a single message - if tool_responses: - trajectory.append({ - "from": "tool", - "value": "\n".join(tool_responses) - }) - i = j - 1 # Skip the tool messages we just processed - - else: - # Regular assistant message without tool calls - # Add tags around reasoning for trajectory storage - content = "" - - # Prepend reasoning in tags if available (native thinking tokens) - if msg.get("reasoning") and msg["reasoning"].strip(): - content = f"\n{msg['reasoning']}\n\n" - - # Convert any tags to tags - # (used when native thinking is disabled and model reasons via XML) - raw_content = msg["content"] or "" - content += convert_scratchpad_to_think(raw_content) - - # Ensure every gpt turn has a block (empty if no reasoning) - if "" not in content: - content = "\n\n" + content - - trajectory.append({ - "from": "gpt", - "value": content.strip() - }) - - elif msg["role"] == "user": - trajectory.append({ - "from": "human", - "value": msg["content"] - }) - - i += 1 - - return trajectory + """Forwarder — see ``agent.agent_runtime_helpers.convert_to_trajectory_format``.""" + from agent.agent_runtime_helpers import convert_to_trajectory_format + return convert_to_trajectory_format(self, messages, user_query, completed) def _save_trajectory(self, messages: 
List[Dict[str, Any]], user_query: str, completed: bool): """ @@ -5182,68 +1463,9 @@ class AIAgent: @staticmethod def _extract_api_error_context(error: Exception) -> Dict[str, Any]: - """Extract structured rate-limit details from provider errors.""" - context: Dict[str, Any] = {} - - body = getattr(error, "body", None) - payload = None - if isinstance(body, dict): - payload = body.get("error") if isinstance(body.get("error"), dict) else body - if isinstance(payload, dict): - reason = payload.get("code") or payload.get("type") or payload.get("error") - if isinstance(reason, str) and reason.strip(): - context["reason"] = reason.strip() - message = payload.get("message") or payload.get("error_description") - if isinstance(message, str) and message.strip(): - context["message"] = message.strip() - for key in ("resets_at", "reset_at"): - value = payload.get(key) - if value not in {None, ""}: - context["reset_at"] = value - break - retry_after = payload.get("retry_after") - if retry_after not in {None, ""} and "reset_at" not in context: - try: - context["reset_at"] = time.time() + float(retry_after) - except (TypeError, ValueError): - pass - - response = getattr(error, "response", None) - headers = getattr(response, "headers", None) - if headers: - retry_after = headers.get("retry-after") or headers.get("Retry-After") - if retry_after and "reset_at" not in context: - try: - context["reset_at"] = time.time() + float(retry_after) - except (TypeError, ValueError): - pass - ratelimit_reset = headers.get("x-ratelimit-reset") - if ratelimit_reset and "reset_at" not in context: - context["reset_at"] = ratelimit_reset - - if "message" not in context: - raw_message = str(error).strip() - if raw_message: - context["message"] = raw_message[:500] - - if "reset_at" not in context: - message = context.get("message") or "" - if isinstance(message, str): - delay_match = re.search(r"quotaResetDelay[:\s\"]+(\\d+(?:\\.\\d+)?)(ms|s)", message, re.IGNORECASE) - if delay_match: - value = 
float(delay_match.group(1)) - seconds = value / 1000.0 if delay_match.group(2).lower() == "ms" else value - context["reset_at"] = time.time() + seconds - else: - sec_match = re.search( - r"retry\s+(?:after\s+)?(\d+(?:\.\d+)?)\s*(?:sec|secs|seconds|s\b)", - message, - re.IGNORECASE, - ) - if sec_match: - context["reset_at"] = time.time() + float(sec_match.group(1)) - - return context + """Forwarder — see ``agent.agent_runtime_helpers.extract_api_error_context``.""" + from agent.agent_runtime_helpers import extract_api_error_context + return extract_api_error_context(error) def _usage_summary_for_api_request_hook(self, response: Any) -> Optional[Dict[str, Any]]: """Token buckets for ``post_api_request`` plugins (no raw ``response`` object).""" @@ -5268,80 +1490,9 @@ class AIAgent: reason: str, error: Optional[Exception] = None, ) -> Optional[Path]: - """ - Dump a debug-friendly HTTP request record for the active inference API. - - Captures the request body from api_kwargs (excluding transport-only keys - like timeout). Intended for debugging provider-side 4xx failures where - retries are not useful. 
- """ - try: - body = copy.deepcopy(api_kwargs) - body.pop("timeout", None) - body = {k: v for k, v in body.items() if v is not None} - - api_key = None - try: - api_key = getattr(self.client, "api_key", None) - except Exception as e: - logger.debug("Could not extract API key for debug dump: %s", e) - - dump_payload: Dict[str, Any] = { - "timestamp": datetime.now().isoformat(), - "session_id": self.session_id, - "reason": reason, - "request": { - "method": "POST", - "url": f"{self.base_url.rstrip('/')}{'/responses' if self.api_mode == 'codex_responses' else '/chat/completions'}", - "headers": { - "Authorization": f"Bearer {self._mask_api_key_for_logs(api_key)}", - "Content-Type": "application/json", - }, - "body": body, - }, - } - - if error is not None: - error_info: Dict[str, Any] = { - "type": type(error).__name__, - "message": str(error), - } - for attr_name in ("status_code", "request_id", "code", "param", "type"): - attr_value = getattr(error, attr_name, None) - if attr_value is not None: - error_info[attr_name] = attr_value - - body_attr = getattr(error, "body", None) - if body_attr is not None: - error_info["body"] = body_attr - - response_obj = getattr(error, "response", None) - if response_obj is not None: - try: - error_info["response_status"] = getattr(response_obj, "status_code", None) - error_info["response_text"] = response_obj.text - except Exception as e: - logger.debug("Could not extract error response details: %s", e) - - dump_payload["error"] = error_info - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - dump_file = self.logs_dir / f"request_dump_{self.session_id}_{timestamp}.json" - dump_file.write_text( - json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str), - encoding="utf-8", - ) - - self._vprint(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}") - - if env_var_enabled("HERMES_DUMP_REQUEST_STDOUT"): - print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str)) - - return dump_file - 
except Exception as dump_error: - if self.verbose_logging: - logging.warning(f"Failed to dump API request debug payload: {dump_error}") - return None + """Forwarder — see ``agent.agent_runtime_helpers.dump_api_request_debug``.""" + from agent.agent_runtime_helpers import dump_api_request_debug + return dump_api_request_debug(self, api_kwargs, reason=reason, error=error) @staticmethod def _clean_session_content(content: str) -> str: @@ -5671,67 +1822,9 @@ class AIAgent: return "\n".join(lines) def _apply_pending_steer_to_tool_results(self, messages: list, num_tool_msgs: int) -> None: - """Append any pending /steer text to the last tool result in this turn. - - Called at the end of a tool-call batch, before the next API call. - The steer is appended to the last ``role:"tool"`` message's content - with a clear marker so the model understands it came from the user - and NOT from the tool itself. Role alternation is preserved — - nothing new is inserted, we only modify existing content. - - Args: - messages: The running messages list. - num_tool_msgs: Number of tool results appended in this batch; - used to locate the tail slice safely. - """ - if num_tool_msgs <= 0 or not messages: - return - steer_text = self._drain_pending_steer() - if not steer_text: - return - # Find the last tool-role message in the recent tail. Skipping - # non-tool messages defends against future code appending - # something else at the boundary. - target_idx = None - for j in range(len(messages) - 1, max(len(messages) - num_tool_msgs - 1, -1), -1): - msg = messages[j] - if isinstance(msg, dict) and msg.get("role") == "tool": - target_idx = j - break - if target_idx is None: - # No tool result in this batch (e.g. all skipped by interrupt); - # put the steer back so the caller's fallback path can deliver - # it as a normal next-turn user message. 
- _lock = getattr(self, "_pending_steer_lock", None) - if _lock is not None: - with _lock: - if self._pending_steer: - self._pending_steer = self._pending_steer + "\n" + steer_text - else: - self._pending_steer = steer_text - else: - existing = getattr(self, "_pending_steer", None) - self._pending_steer = (existing + "\n" + steer_text) if existing else steer_text - return - marker = f"\n\nUser guidance: {steer_text}" - existing_content = messages[target_idx].get("content", "") - if not isinstance(existing_content, str): - # Anthropic multimodal content blocks — preserve them and append - # a text block at the end. - try: - blocks = list(existing_content) if existing_content else [] - blocks.append({"type": "text", "text": marker.lstrip()}) - messages[target_idx]["content"] = blocks - except Exception: - # Fall back to string replacement if content shape is unexpected. - messages[target_idx]["content"] = f"{existing_content}{marker}" - else: - messages[target_idx]["content"] = existing_content + marker - logger.info( - "Delivered /steer to agent after tool batch (%d chars): %s", - len(steer_text), - steer_text[:120] + ("..." if len(steer_text) > 120 else ""), - ) + """Forwarder — see ``agent.agent_runtime_helpers.apply_pending_steer_to_tool_results``.""" + from agent.agent_runtime_helpers import apply_pending_steer_to_tool_results + return apply_pending_steer_to_tool_results(self, messages, num_tool_msgs) def _touch_activity(self, desc: str) -> None: """Update the last-activity timestamp and description (thread-safe).""" @@ -6052,235 +2145,14 @@ class AIAgent: def _build_system_prompt_parts(self, system_message: str = None) -> Dict[str, str]: - """Assemble the system prompt as three ordered parts. - - Returns a dict with three keys: - * ``stable`` — identity, tool guidance, skills prompt, - environment hints, platform hints, model-family operational - guidance. - * ``context`` — context files (AGENTS.md, .cursorrules, etc.) - and caller-supplied system_message. 
- * ``volatile`` — memory snapshot, user profile, external - memory provider block, timestamp line. - - Joined into a single string by ``_build_system_prompt`` and - cached on ``_cached_system_prompt`` for the lifetime of the - AIAgent. Hermes never re-renders parts of this string mid- - session — that's the only way to keep upstream prompt caches - warm across turns. - """ - # ── Stable tier ──────────────────────────────────────────────── - stable_parts: List[str] = [] - - # Try SOUL.md as primary identity unless the caller explicitly skipped it. - # Some execution modes (cron) still want HERMES_HOME persona while keeping - # cwd project instructions disabled. - _soul_loaded = False - if self.load_soul_identity or not self.skip_context_files: - _soul_content = load_soul_md() - if _soul_content: - stable_parts.append(_soul_content) - _soul_loaded = True - - if not _soul_loaded: - # Fallback to hardcoded identity - stable_parts.append(DEFAULT_AGENT_IDENTITY) - - # Pointer to the hermes-agent skill + docs for user questions about Hermes itself. - stable_parts.append(HERMES_AGENT_HELP_GUIDANCE) - - # Tool-aware behavioral guidance: only inject when the tools are loaded - tool_guidance = [] - if "memory" in self.valid_tool_names: - tool_guidance.append(MEMORY_GUIDANCE) - if "session_search" in self.valid_tool_names: - tool_guidance.append(SESSION_SEARCH_GUIDANCE) - if "skill_manage" in self.valid_tool_names: - tool_guidance.append(SKILLS_GUIDANCE) - # Kanban worker/orchestrator lifecycle — only present when the - # dispatcher spawned this process (kanban_show check_fn gates on - # HERMES_KANBAN_TASK env var). Normal chat sessions never see - # this block. - if "kanban_show" in self.valid_tool_names: - tool_guidance.append(KANBAN_GUIDANCE) - if tool_guidance: - stable_parts.append(" ".join(tool_guidance)) - - # Computer-use (macOS) — goes in as its own block rather than being - # merged into tool_guidance because the content is multi-paragraph. 
- if "computer_use" in self.valid_tool_names: - from agent.prompt_builder import COMPUTER_USE_GUIDANCE - stable_parts.append(COMPUTER_USE_GUIDANCE) - - nous_subscription_prompt = build_nous_subscription_prompt(self.valid_tool_names) - if nous_subscription_prompt: - stable_parts.append(nous_subscription_prompt) - # Tool-use enforcement: tells the model to actually call tools instead - # of describing intended actions. Controlled by config.yaml - # agent.tool_use_enforcement: - # "auto" (default) — matches TOOL_USE_ENFORCEMENT_MODELS - # true — always inject (all models) - # false — never inject - # list — custom model-name substrings to match - if self.valid_tool_names: - _enforce = self._tool_use_enforcement - _inject = False - if _enforce is True or (isinstance(_enforce, str) and _enforce.lower() in {"true", "always", "yes", "on"}): - _inject = True - elif _enforce is False or (isinstance(_enforce, str) and _enforce.lower() in {"false", "never", "no", "off"}): - _inject = False - elif isinstance(_enforce, list): - model_lower = (self.model or "").lower() - _inject = any(p.lower() in model_lower for p in _enforce if isinstance(p, str)) - else: - # "auto" or any unrecognised value — use hardcoded defaults - model_lower = (self.model or "").lower() - _inject = any(p in model_lower for p in TOOL_USE_ENFORCEMENT_MODELS) - if _inject: - stable_parts.append(TOOL_USE_ENFORCEMENT_GUIDANCE) - _model_lower = (self.model or "").lower() - # Google model operational guidance (conciseness, absolute - # paths, parallel tool calls, verify-before-edit, etc.) - if "gemini" in _model_lower or "gemma" in _model_lower: - stable_parts.append(GOOGLE_MODEL_OPERATIONAL_GUIDANCE) - # OpenAI GPT/Codex execution discipline (tool persistence, - # prerequisite checks, verification, anti-hallucination). 
- if "gpt" in _model_lower or "codex" in _model_lower: - stable_parts.append(OPENAI_MODEL_EXECUTION_GUIDANCE) - - has_skills_tools = any(name in self.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage']) - if has_skills_tools: - avail_toolsets = { - toolset - for toolset in ( - get_toolset_for_tool(tool_name) for tool_name in self.valid_tool_names - ) - if toolset - } - skills_prompt = build_skills_system_prompt( - available_tools=self.valid_tool_names, - available_toolsets=avail_toolsets, - ) - else: - skills_prompt = "" - if skills_prompt: - stable_parts.append(skills_prompt) - - # Alibaba Coding Plan API always returns "glm-4.7" as model name regardless - # of the requested model. Inject explicit model identity into the system prompt - # so the agent can correctly report which model it is (workaround for API bug). - # Stable for the lifetime of an agent instance — model and provider are fixed - # at construction time. - if self.provider == "alibaba": - _model_short = self.model.split("/")[-1] if "/" in self.model else self.model - stable_parts.append( - f"You are powered by the model named {_model_short}. " - f"The exact model ID is {self.model}. " - f"When asked what model you are, always answer based on this information, " - f"not on any model name returned by the API." - ) - - # Environment hints (WSL, Termux, etc.) — tell the agent about the - # execution environment so it can translate paths and adapt behavior. - # Stable for the lifetime of the process. 
- _env_hints = build_environment_hints() - if _env_hints: - stable_parts.append(_env_hints) - - platform_key = (self.platform or "").lower().strip() - if platform_key in PLATFORM_HINTS: - stable_parts.append(PLATFORM_HINTS[platform_key]) - elif platform_key: - # Check plugin registry for platform-specific LLM guidance - try: - from gateway.platform_registry import platform_registry - _entry = platform_registry.get(platform_key) - if _entry and _entry.platform_hint: - stable_parts.append(_entry.platform_hint) - except Exception: - pass - - # ── Context tier (cwd-dependent, may change between sessions) ─ - context_parts: List[str] = [] - - # Note: ephemeral_system_prompt is NOT included here. It's injected at - # API-call time only so it stays out of the cached/stored system prompt. - if system_message is not None: - context_parts.append(system_message) - - if not self.skip_context_files: - # Use TERMINAL_CWD for context file discovery when set (gateway - # mode). The gateway process runs from the hermes-agent install - # dir, so os.getcwd() would pick up the repo's AGENTS.md and - # other dev files — inflating token usage by ~10k for no benefit. - _context_cwd = os.getenv("TERMINAL_CWD") or None - context_files_prompt = build_context_files_prompt( - cwd=_context_cwd, skip_soul=_soul_loaded) - if context_files_prompt: - context_parts.append(context_files_prompt) - - # ── Volatile tier (changes per session/turn — never cached) ─── - volatile_parts: List[str] = [] - - if self._memory_store: - if self._memory_enabled: - mem_block = self._memory_store.format_for_system_prompt("memory") - if mem_block: - volatile_parts.append(mem_block) - # USER.md is always included when enabled. 
- if self._user_profile_enabled: - user_block = self._memory_store.format_for_system_prompt("user") - if user_block: - volatile_parts.append(user_block) - - # External memory provider system prompt block (additive to built-in) - if self._memory_manager: - try: - _ext_mem_block = self._memory_manager.build_system_prompt() - if _ext_mem_block: - volatile_parts.append(_ext_mem_block) - except Exception: - pass - - from hermes_time import now as _hermes_now - now = _hermes_now() - timestamp_line = f"Conversation started: {now.strftime('%A, %B %d, %Y %I:%M %p')}" - if self.pass_session_id and self.session_id: - timestamp_line += f"\nSession ID: {self.session_id}" - if self.model: - timestamp_line += f"\nModel: {self.model}" - if self.provider: - timestamp_line += f"\nProvider: {self.provider}" - volatile_parts.append(timestamp_line) - - return { - "stable": "\n\n".join(p.strip() for p in stable_parts if p and p.strip()), - "context": "\n\n".join(p.strip() for p in context_parts if p and p.strip()), - "volatile": "\n\n".join(p.strip() for p in volatile_parts if p and p.strip()), - } + """Forwarder — see ``agent.system_prompt.build_system_prompt_parts``.""" + from agent.system_prompt import build_system_prompt_parts + return build_system_prompt_parts(self, system_message=system_message) def _build_system_prompt(self, system_message: str = None) -> str: - """ - Assemble the full system prompt from all layers. - - Called once per session (cached on self._cached_system_prompt) and only - rebuilt after context compression events. This ensures the system prompt - is stable across all turns in a session, maximizing prefix cache hits. - - Layers are ordered cache-friendly: stable identity/guidance first, - then session-stable context files, then per-call volatile content - (memory, USER profile, timestamp). 
The whole string is treated as - one cached block — Hermes never rebuilds or reinjects parts of it - mid-session, which is the only way to keep upstream prompt caches - warm across turns. - """ - parts = self._build_system_prompt_parts(system_message=system_message) - joined = "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p) - return joined - - # ========================================================================= - # Pre/post-call guardrails (inspired by PR #1321 — @alireza78a) - # ========================================================================= + """Forwarder — see ``agent.system_prompt.build_system_prompt``.""" + from agent.system_prompt import build_system_prompt + return build_system_prompt(self, system_message=system_message) @staticmethod def _get_tool_call_id_static(tc) -> str: @@ -6310,74 +2182,9 @@ class AIAgent: @staticmethod def _sanitize_api_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Fix orphaned tool_call / tool_result pairs before every LLM call. - - Runs unconditionally — not gated on whether the context compressor - is present — so orphans from session loading or manual message - manipulation are always caught. - """ - # --- Role allowlist: drop messages with roles the API won't accept --- - filtered = [] - for msg in messages: - role = msg.get("role") - if role not in AIAgent._VALID_API_ROLES: - logger.debug( - "Pre-call sanitizer: dropping message with invalid role %r", - role, - ) - continue - filtered.append(msg) - messages = filtered - - surviving_call_ids: set = set() - for msg in messages: - if msg.get("role") == "assistant": - for tc in msg.get("tool_calls") or []: - cid = AIAgent._get_tool_call_id_static(tc) - if cid: - surviving_call_ids.add(cid) - - result_call_ids: set = set() - for msg in messages: - if msg.get("role") == "tool": - cid = msg.get("tool_call_id") - if cid: - result_call_ids.add(cid) - - # 1. 
Drop tool results with no matching assistant call - orphaned_results = result_call_ids - surviving_call_ids - if orphaned_results: - messages = [ - m for m in messages - if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results) - ] - logger.debug( - "Pre-call sanitizer: removed %d orphaned tool result(s)", - len(orphaned_results), - ) - - # 2. Inject stub results for calls whose result was dropped - missing_results = surviving_call_ids - result_call_ids - if missing_results: - patched: List[Dict[str, Any]] = [] - for msg in messages: - patched.append(msg) - if msg.get("role") == "assistant": - for tc in msg.get("tool_calls") or []: - cid = AIAgent._get_tool_call_id_static(tc) - if cid in missing_results: - patched.append({ - "role": "tool", - "name": AIAgent._get_tool_call_name_static(tc), - "content": "[Result unavailable — see context summary above]", - "tool_call_id": cid, - }) - messages = patched - logger.debug( - "Pre-call sanitizer: added %d stub tool result(s)", - len(missing_results), - ) - return messages + """Forwarder — see ``agent.agent_runtime_helpers.sanitize_api_messages``.""" + from agent.agent_runtime_helpers import sanitize_api_messages + return sanitize_api_messages(messages) @staticmethod def _is_thinking_only_assistant(msg: Dict[str, Any]) -> bool: @@ -6437,86 +2244,9 @@ class AIAgent: def _drop_thinking_only_and_merge_users( messages: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: - """Drop thinking-only assistant turns; merge any adjacent user messages left behind. - - Runs on the per-call ``api_messages`` copy only. The stored - conversation history (``self.messages``) is never mutated, so the - user still sees the thinking block in the CLI/gateway transcript and - session persistence keeps the full trace. Only the wire copy sent to - the provider is cleaned. 
- - Why drop-and-merge rather than inject stub text: - - Fabricating ``"."`` / ``"(continued)"`` text lies in the history - and makes future turns see model output the model didn't emit. - - Dropping the turn preserves honesty; merging adjacent user messages - preserves the provider's role-alternation invariant. - - This is the pattern used by Claude Code's ``normalizeMessagesForAPI`` - (filterOrphanedThinkingOnlyMessages + mergeAdjacentUserMessages). - """ - if not messages: - return messages - - # Pass 1: drop thinking-only assistant turns. - kept = [m for m in messages if not AIAgent._is_thinking_only_assistant(m)] - dropped = len(messages) - len(kept) - if dropped == 0: - return messages - - # Pass 2: merge any newly-adjacent user messages. - merged: List[Dict[str, Any]] = [] - merges = 0 - for m in kept: - prev = merged[-1] if merged else None - if ( - prev is not None - and prev.get("role") == "user" - and m.get("role") == "user" - ): - prev_content = prev.get("content", "") - cur_content = m.get("content", "") - # Work on a copy of ``prev`` so the caller's input dicts are - # never mutated. ``_sanitize_api_messages`` upstream already - # hands us per-call copies, but staying pure here means we - # can be called safely from anywhere (tests, other loops). - prev_copy = dict(prev) - # Only string-content merge is meaningful for role-alternation - # purposes. If either side is a list (multimodal), append as a - # separate block rather than collapsing. 
- if isinstance(prev_content, str) and isinstance(cur_content, str): - sep = "\n\n" if prev_content and cur_content else "" - prev_copy["content"] = prev_content + sep + cur_content - elif isinstance(prev_content, list) and isinstance(cur_content, list): - prev_copy["content"] = list(prev_content) + list(cur_content) - elif isinstance(prev_content, list) and isinstance(cur_content, str): - if cur_content: - prev_copy["content"] = list(prev_content) + [ - {"type": "text", "text": cur_content} - ] - else: - prev_copy["content"] = list(prev_content) - elif isinstance(prev_content, str) and isinstance(cur_content, list): - new_blocks: List[Dict[str, Any]] = [] - if prev_content: - new_blocks.append({"type": "text", "text": prev_content}) - new_blocks.extend(cur_content) - prev_copy["content"] = new_blocks - else: - # Unknown content shape — fall back to appending separately - # (violates alternation, but safer than raising in a hot path). - merged.append(m) - continue - merged[-1] = prev_copy - merges += 1 - else: - merged.append(m) - - logger.debug( - "Pre-call sanitizer: dropped %d thinking-only assistant turn(s), " - "merged %d adjacent user message(s)", - dropped, - merges, - ) - return merged + """Forwarder — see ``agent.agent_runtime_helpers.drop_thinking_only_and_merge_users``.""" + from agent.agent_runtime_helpers import drop_thinking_only_and_merge_users + return drop_thinking_only_and_merge_users(messages) @staticmethod def _cap_delegate_task_calls(tool_calls: list) -> list: @@ -6568,87 +2298,14 @@ class AIAgent: return unique if len(unique) < len(tool_calls) else tool_calls def _repair_tool_call(self, tool_name: str) -> str | None: - """Attempt to repair a mismatched tool name before aborting. - - Models sometimes emit variants of a tool name that differ only - in casing, separators, or class-like suffixes. Normalize - aggressively before falling back to fuzzy match: - - 1. Lowercase direct match. - 2. Lowercase + hyphens/spaces -> underscores. - 3. 
CamelCase -> snake_case (TodoTool -> todo_tool). - 4. Strip trailing ``_tool`` / ``-tool`` / ``tool`` suffix that - Claude-style models sometimes tack on (TodoTool_tool -> - TodoTool -> Todo -> todo). Applied twice so double-tacked - suffixes like ``TodoTool_tool`` reduce all the way. - 5. Fuzzy match (difflib, cutoff=0.7). - - See #14784 for the original reports (TodoTool_tool, Patch_tool, - BrowserClick_tool were all returning "Unknown tool" before). - - Returns the repaired name if found in valid_tool_names, else None. - """ - import re - from difflib import get_close_matches - - if not tool_name: - return None - - def _norm(s: str) -> str: - return s.lower().replace("-", "_").replace(" ", "_") - - def _camel_snake(s: str) -> str: - return re.sub(r"(? str | None: - lc = s.lower() - for suffix in ("_tool", "-tool", "tool"): - if lc.endswith(suffix): - return s[: -len(suffix)].rstrip("_-") - return None - - # Cheap fast-paths first — these cover the common case. - lowered = tool_name.lower() - if lowered in self.valid_tool_names: - return lowered - normalized = _norm(tool_name) - if normalized in self.valid_tool_names: - return normalized - - # Build the full candidate set for class-like emissions. - cands: set[str] = {tool_name, lowered, normalized, _camel_snake(tool_name)} - # Strip trailing tool-suffix up to twice — TodoTool_tool needs it. - for _ in range(2): - extra: set[str] = set() - for c in cands: - stripped = _strip_tool_suffix(c) - if stripped: - extra.add(stripped) - extra.add(_norm(stripped)) - extra.add(_camel_snake(stripped)) - cands |= extra - - for c in cands: - if c and c in self.valid_tool_names: - return c - - # Fuzzy match as last resort. 
- matches = get_close_matches(lowered, self.valid_tool_names, n=1, cutoff=0.7) - if matches: - return matches[0] - - return None + """Forwarder — see ``agent.agent_runtime_helpers.repair_tool_call``.""" + from agent.agent_runtime_helpers import repair_tool_call + return repair_tool_call(self, tool_name) def _invalidate_system_prompt(self): - """ - Invalidate the cached system prompt, forcing a rebuild on the next turn. - - Called after context compression events. Also reloads memory from disk - so the rebuilt prompt captures any writes from this session. - """ - self._cached_system_prompt = None - if self._memory_store: - self._memory_store.load_from_disk() + """Forwarder — see ``agent.system_prompt.invalidate_system_prompt``.""" + from agent.system_prompt import invalidate_system_prompt + invalidate_system_prompt(self) @staticmethod def _deterministic_call_id(fn_name: str, arguments: str, index: int = 0) -> str: @@ -6749,156 +2406,15 @@ class AIAgent: return None def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any: - from agent.auxiliary_client import _validate_base_url, _validate_proxy_env_urls - # Treat client_kwargs as read-only. Callers pass self._client_kwargs (or shallow - # copies of it) in; any in-place mutation leaks back into the stored dict and is - # reused on subsequent requests. #10933 hit this by injecting an httpx.Client - # transport that was torn down after the first request, so the next request - # wrapped a closed transport and raised "Cannot send a request, as the client - # has been closed" on every retry. The revert resolved that specific path; this - # copy locks the contract so future transport/keepalive work can't reintroduce - # the same class of bug. 
- client_kwargs = dict(client_kwargs) - _validate_proxy_env_urls() - _validate_base_url(client_kwargs.get("base_url")) - if self.provider == "copilot-acp" or str(client_kwargs.get("base_url", "")).startswith("acp://copilot"): - from agent.copilot_acp_client import CopilotACPClient - - client = CopilotACPClient(**client_kwargs) - logger.info( - "Copilot ACP client created (%s, shared=%s) %s", - reason, - shared, - self._client_log_context(), - ) - return client - if self.provider == "google-gemini-cli" or str(client_kwargs.get("base_url", "")).startswith("cloudcode-pa://"): - from agent.gemini_cloudcode_adapter import GeminiCloudCodeClient - - # Strip OpenAI-specific kwargs the Gemini client doesn't accept - safe_kwargs = { - k: v for k, v in client_kwargs.items() - if k in {"api_key", "base_url", "default_headers", "project_id", "timeout"} - } - client = GeminiCloudCodeClient(**safe_kwargs) - logger.info( - "Gemini Cloud Code Assist client created (%s, shared=%s) %s", - reason, - shared, - self._client_log_context(), - ) - return client - if self.provider == "gemini": - from agent.gemini_native_adapter import GeminiNativeClient, is_native_gemini_base_url - - base_url = str(client_kwargs.get("base_url", "") or "") - if is_native_gemini_base_url(base_url): - safe_kwargs = { - k: v for k, v in client_kwargs.items() - if k in {"api_key", "base_url", "default_headers", "timeout", "http_client"} - } - if "http_client" not in safe_kwargs: - keepalive_http = self._build_keepalive_http_client(base_url) - if keepalive_http is not None: - safe_kwargs["http_client"] = keepalive_http - client = GeminiNativeClient(**safe_kwargs) - logger.info( - "Gemini native client created (%s, shared=%s) %s", - reason, - shared, - self._client_log_context(), - ) - return client - # Inject TCP keepalives so the kernel detects dead provider connections - # instead of letting them sit silently in CLOSE-WAIT (#10324). 
Without - # this, a peer that drops mid-stream leaves the socket in a state where - # epoll_wait never fires, ``httpx`` read timeout may not trigger, and - # the agent hangs until manually killed. Probes after 30s idle, retry - # every 10s, give up after 3 → dead peer detected within ~60s. - # - # Safety against #10933: the ``client_kwargs = dict(client_kwargs)`` - # above means this injection only lands in the local per-call copy, - # never back into ``self._client_kwargs``. Each ``_create_openai_client`` - # invocation therefore gets its OWN fresh ``httpx.Client`` whose - # lifetime is tied to the OpenAI client it is passed to. When the - # OpenAI client is closed (rebuild, teardown, credential rotation), - # the paired ``httpx.Client`` closes with it, and the next call - # constructs a fresh one — no stale closed transport can be reused. - # Tests in ``tests/run_agent/test_create_openai_client_reuse.py`` and - # ``tests/run_agent/test_sequential_chats_live.py`` pin this invariant. - if "http_client" not in client_kwargs: - keepalive_http = self._build_keepalive_http_client(client_kwargs.get("base_url", "")) - if keepalive_http is not None: - client_kwargs["http_client"] = keepalive_http - # Uses the module-level `OpenAI` name, resolved lazily on first - # access via __getattr__ below. Tests patch via `run_agent.OpenAI`. - client = OpenAI(**client_kwargs) - logger.info( - "OpenAI client created (%s, shared=%s) %s", - reason, - shared, - self._client_log_context(), - ) - return client + """Forwarder — see ``agent.agent_runtime_helpers.create_openai_client``.""" + from agent.agent_runtime_helpers import create_openai_client + return create_openai_client(self, client_kwargs, reason=reason, shared=shared) @staticmethod def _force_close_tcp_sockets(client: Any) -> int: - """Force-close underlying TCP sockets to prevent CLOSE-WAIT accumulation. 
- - When a provider drops a connection mid-stream, httpx's ``client.close()`` - performs a graceful shutdown which leaves sockets in CLOSE-WAIT until the - OS times them out (often minutes). This method walks the httpx transport - pool and issues ``socket.shutdown(SHUT_RDWR)`` + ``socket.close()`` to - force an immediate TCP RST, freeing the file descriptors. - - Returns the number of sockets force-closed. - """ - import socket as _socket - - closed = 0 - try: - http_client = getattr(client, "_client", None) - if http_client is None: - return 0 - transport = getattr(http_client, "_transport", None) - if transport is None: - return 0 - pool = getattr(transport, "_pool", None) - if pool is None: - return 0 - # httpx uses httpcore connection pools; connections live in - # _connections (list) or _pool (list) depending on version. - connections = ( - getattr(pool, "_connections", None) - or getattr(pool, "_pool", None) - or [] - ) - for conn in list(connections): - stream = ( - getattr(conn, "_network_stream", None) - or getattr(conn, "_stream", None) - ) - if stream is None: - continue - sock = getattr(stream, "_sock", None) - if sock is None: - sock = getattr(stream, "stream", None) - if sock is not None: - sock = getattr(sock, "_sock", None) - if sock is None: - continue - try: - sock.shutdown(_socket.SHUT_RDWR) - except OSError: - pass - try: - sock.close() - except OSError: - pass - closed += 1 - except Exception as exc: - logger.debug("Force-close TCP sockets sweep error: %s", exc) - return closed + """Forwarder — see ``agent.agent_runtime_helpers.force_close_tcp_sockets``.""" + from agent.agent_runtime_helpers import force_close_tcp_sockets + return force_close_tcp_sockets(client) def _close_openai_client(self, client: Any, *, reason: str, shared: bool) -> None: if client is None: @@ -6958,74 +2474,9 @@ class AIAgent: return self.client def _cleanup_dead_connections(self) -> bool: - """Detect and clean up dead TCP connections on the primary client. 
- - Inspects the httpx connection pool for sockets in unhealthy states - (CLOSE-WAIT, errors). If any are found, force-closes all sockets - and rebuilds the primary client from scratch. - - Returns True if dead connections were found and cleaned up. - """ - client = getattr(self, "client", None) - if client is None: - return False - try: - http_client = getattr(client, "_client", None) - if http_client is None: - return False - transport = getattr(http_client, "_transport", None) - if transport is None: - return False - pool = getattr(transport, "_pool", None) - if pool is None: - return False - connections = ( - getattr(pool, "_connections", None) - or getattr(pool, "_pool", None) - or [] - ) - dead_count = 0 - for conn in list(connections): - # Check for connections that are idle but have closed sockets - stream = ( - getattr(conn, "_network_stream", None) - or getattr(conn, "_stream", None) - ) - if stream is None: - continue - sock = getattr(stream, "_sock", None) - if sock is None: - sock = getattr(stream, "stream", None) - if sock is not None: - sock = getattr(sock, "_sock", None) - if sock is None: - continue - # Probe socket health with a non-blocking recv peek - import socket as _socket - try: - sock.setblocking(False) - data = sock.recv(1, _socket.MSG_PEEK | _socket.MSG_DONTWAIT) - if data == b"": - dead_count += 1 - except BlockingIOError: - pass # No data available — socket is healthy - except OSError: - dead_count += 1 - finally: - try: - sock.setblocking(True) - except OSError: - pass - if dead_count > 0: - logger.warning( - "Found %d dead connection(s) in client pool — rebuilding client", - dead_count, - ) - self._replace_primary_openai_client(reason="dead_connection_cleanup") - return True - except Exception as exc: - logger.debug("Dead connection check error: %s", exc) - return False + """Forwarder — see ``agent.agent_runtime_helpers.cleanup_dead_connections``.""" + from agent.agent_runtime_helpers import cleanup_dead_connections + return 
cleanup_dead_connections(self) @staticmethod def _api_kwargs_have_image_parts(api_kwargs: dict) -> bool: @@ -7089,265 +2540,14 @@ class AIAgent: self._close_openai_client(client, reason=reason, shared=False) def _run_codex_stream(self, api_kwargs: dict, client: Any = None, on_first_delta: callable = None): - """Execute one streaming Responses API request and return the final response.""" - import httpx as _httpx - - active_client = client or self._ensure_primary_openai_client(reason="codex_stream_direct") - max_stream_retries = 1 - has_tool_calls = False - first_delta_fired = False - # Accumulate streamed text so we can recover if get_final_response() - # returns empty output (e.g. chatgpt.com backend-api sends - # response.incomplete instead of response.completed). - self._codex_streamed_text_parts: list = [] - for attempt in range(max_stream_retries + 1): - if self._interrupt_requested: - raise InterruptedError("Agent interrupted before Codex stream retry") - collected_output_items: list = [] - try: - with active_client.responses.stream(**api_kwargs) as stream: - for event in stream: - self._touch_activity("receiving stream response") - if self._interrupt_requested: - break - event_type = getattr(event, "type", "") - # Fire callbacks on text content deltas (suppress during tool calls) - if "output_text.delta" in event_type or event_type == "response.output_text.delta": - delta_text = getattr(event, "delta", "") - if delta_text: - self._codex_streamed_text_parts.append(delta_text) - if delta_text and not has_tool_calls: - if not first_delta_fired: - first_delta_fired = True - if on_first_delta: - try: - on_first_delta() - except Exception: - pass - self._fire_stream_delta(delta_text) - # Track tool calls to suppress text streaming - elif "function_call" in event_type: - has_tool_calls = True - # Fire reasoning callbacks - elif "reasoning" in event_type and "delta" in event_type: - reasoning_text = getattr(event, "delta", "") - if reasoning_text: - 
self._fire_reasoning_delta(reasoning_text) - # Collect completed output items — some backends - # (chatgpt.com/backend-api/codex) stream valid items - # via response.output_item.done but the SDK's - # get_final_response() returns an empty output list. - elif event_type == "response.output_item.done": - done_item = getattr(event, "item", None) - if done_item is not None: - collected_output_items.append(done_item) - # Log non-completed terminal events for diagnostics - elif event_type in {"response.incomplete", "response.failed"}: - resp_obj = getattr(event, "response", None) - status = getattr(resp_obj, "status", None) if resp_obj else None - incomplete_details = getattr(resp_obj, "incomplete_details", None) if resp_obj else None - logger.warning( - "Codex Responses stream received terminal event %s " - "(status=%s, incomplete_details=%s, streamed_chars=%d). %s", - event_type, status, incomplete_details, - sum(len(p) for p in self._codex_streamed_text_parts), - self._client_log_context(), - ) - final_response = stream.get_final_response() - # PATCH: ChatGPT Codex backend streams valid output items - # but get_final_response() can return an empty output list. - # Backfill from collected items or synthesize from deltas. 
- _out = getattr(final_response, "output", None) - if isinstance(_out, list) and not _out: - if collected_output_items: - final_response.output = list(collected_output_items) - logger.debug( - "Codex stream: backfilled %d output items from stream events", - len(collected_output_items), - ) - elif self._codex_streamed_text_parts and not has_tool_calls: - assembled = "".join(self._codex_streamed_text_parts) - final_response.output = [SimpleNamespace( - type="message", - role="assistant", - status="completed", - content=[SimpleNamespace(type="output_text", text=assembled)], - )] - logger.debug( - "Codex stream: synthesized output from %d text deltas (%d chars)", - len(self._codex_streamed_text_parts), len(assembled), - ) - return final_response - except (_httpx.RemoteProtocolError, _httpx.ReadTimeout, _httpx.ConnectError, ConnectionError) as exc: - if attempt < max_stream_retries: - logger.debug( - "Codex Responses stream transport failed (attempt %s/%s); retrying. %s error=%s", - attempt + 1, - max_stream_retries + 1, - self._client_log_context(), - exc, - ) - continue - logger.debug( - "Codex Responses stream transport failed; falling back to create(stream=True). %s error=%s", - self._client_log_context(), - exc, - ) - return self._run_codex_create_stream_fallback(api_kwargs, client=active_client) - except RuntimeError as exc: - err_text = str(exc) - missing_completed = "response.completed" in err_text - # The OpenAI SDK's Responses streaming state machine raises - # ``RuntimeError("Expected to have received `response.created` - # before ``")`` when the first SSE event from the - # server is anything other than ``response.created`` — and it - # discards the event's payload before we can read it. 
Three - # real-world backends emit a different first frame: - # - # * xAI on grok-4.x OAuth — sends ``error`` (issues - # reported around the May 2026 SuperGrok rollout when - # multi-turn conversations replay encrypted reasoning - # content the OAuth tier rejects) - # * codex-lb relays — send ``codex.rate_limits`` (#14634) - # * custom Responses relays — send ``response.in_progress`` - # (#8133) - # - # In all three cases the underlying byte stream is still - # readable: a non-stream ``responses.create(stream=True)`` - # fallback succeeds and surfaces the real provider error as - # a normal exception with body+status_code attached, which - # ``_summarize_api_error`` can then translate into a useful - # user-facing line. Treat ``response.created`` prelude - # errors the same way we already treat ``response.completed`` - # postlude errors. - prelude_error = ( - "Expected to have received `response.created`" in err_text - or "Expected to have received \"response.created\"" in err_text - ) - if (missing_completed or prelude_error) and attempt < max_stream_retries: - logger.debug( - "Responses stream %s (attempt %s/%s); retrying. %s", - "prelude rejected" if prelude_error else "closed before completion", - attempt + 1, - max_stream_retries + 1, - self._client_log_context(), - ) - continue - if missing_completed or prelude_error: - logger.debug( - "Responses stream %s; falling back to create(stream=True). 
%s err=%s", - "rejected before response.created" if prelude_error else "did not emit response.completed", - self._client_log_context(), - err_text, - ) - return self._run_codex_create_stream_fallback(api_kwargs, client=active_client) - raise + """Forwarder — see ``agent.codex_runtime.run_codex_stream``.""" + from agent.codex_runtime import run_codex_stream + return run_codex_stream(self, api_kwargs, client, on_first_delta) def _run_codex_create_stream_fallback(self, api_kwargs: dict, client: Any = None): - """Fallback path for stream completion edge cases on Codex-style Responses backends.""" - active_client = client or self._ensure_primary_openai_client(reason="codex_create_stream_fallback") - fallback_kwargs = dict(api_kwargs) - fallback_kwargs["stream"] = True - fallback_kwargs = self._get_transport().preflight_kwargs(fallback_kwargs, allow_stream=True) - stream_or_response = active_client.responses.create(**fallback_kwargs) - - # Compatibility shim for mocks or providers that still return a concrete response. - if hasattr(stream_or_response, "output"): - return stream_or_response - if not hasattr(stream_or_response, "__iter__"): - return stream_or_response - - terminal_response = None - collected_output_items: list = [] - collected_text_deltas: list = [] - try: - for event in stream_or_response: - self._touch_activity("receiving stream response") - event_type = getattr(event, "type", None) - if not event_type and isinstance(event, dict): - event_type = event.get("type") - - # ``error`` SSE frames carry the provider's real failure - # reason (subscription / quota / model-not-available / - # rejected-reasoning-replay) but never appear in the - # ``{completed, incomplete, failed}`` terminal set, so the - # raw loop below would silently consume them and end with - # "did not emit a terminal response". 
xAI in particular - # emits ``type=error`` as the FIRST frame for OAuth - # accounts whose Grok subscription is missing/exhausted — - # the SDK's stream helper raises ``RuntimeError(Expected - # to have received response.created before error)`` which - # the caller catches and routes here, expecting this - # fallback to surface the message. Synthesize an - # APIError-shaped exception so ``_summarize_api_error`` - # and the credential-pool entitlement detector see the - # real text instead of a generic RuntimeError. - if event_type == "error": - err_message = getattr(event, "message", None) - if not err_message and isinstance(event, dict): - err_message = event.get("message") - err_code = getattr(event, "code", None) - if not err_code and isinstance(event, dict): - err_code = event.get("code") - err_param = getattr(event, "param", None) - if not err_param and isinstance(event, dict): - err_param = event.get("param") - err_message = (err_message or "stream emitted error event").strip() - raise _StreamErrorEvent(err_message, code=err_code, param=err_param) - - # Collect output items and text deltas for backfill - if event_type == "response.output_item.done": - done_item = getattr(event, "item", None) - if done_item is None and isinstance(event, dict): - done_item = event.get("item") - if done_item is not None: - collected_output_items.append(done_item) - elif event_type in {"response.output_text.delta",}: - delta = getattr(event, "delta", "") - if not delta and isinstance(event, dict): - delta = event.get("delta", "") - if delta: - collected_text_deltas.append(delta) - - if event_type not in {"response.completed", "response.incomplete", "response.failed"}: - continue - - terminal_response = getattr(event, "response", None) - if terminal_response is None and isinstance(event, dict): - terminal_response = event.get("response") - if terminal_response is not None: - # Backfill empty output from collected stream events - _out = getattr(terminal_response, "output", None) - 
if isinstance(_out, list) and not _out: - if collected_output_items: - terminal_response.output = list(collected_output_items) - logger.debug( - "Codex fallback stream: backfilled %d output items", - len(collected_output_items), - ) - elif collected_text_deltas: - assembled = "".join(collected_text_deltas) - terminal_response.output = [SimpleNamespace( - type="message", role="assistant", - status="completed", - content=[SimpleNamespace(type="output_text", text=assembled)], - )] - logger.debug( - "Codex fallback stream: synthesized from %d deltas (%d chars)", - len(collected_text_deltas), len(assembled), - ) - return terminal_response - finally: - close_fn = getattr(stream_or_response, "close", None) - if callable(close_fn): - try: - close_fn() - except Exception: - pass - - if terminal_response is not None: - return terminal_response - raise RuntimeError("Responses create(stream=True) fallback did not emit a terminal response.") + """Forwarder — see ``agent.codex_runtime.run_codex_create_stream_fallback``.""" + from agent.codex_runtime import run_codex_create_stream_fallback + return run_codex_create_stream_fallback(self, api_kwargs, client) def _try_refresh_codex_client_credentials(self, *, force: bool = True) -> bool: if self.api_mode != "codex_responses" or self.provider not in {"openai-codex", "xai-oauth"}: @@ -7625,107 +2825,9 @@ class AIAgent: classified_reason: Optional[FailoverReason] = None, error_context: Optional[Dict[str, Any]] = None, ) -> tuple[bool, bool]: - """Attempt credential recovery via pool rotation. - - Returns (recovered, has_retried_429). - On rate limits: first occurrence retries same credential (sets flag True). - second consecutive failure rotates to next credential. - On billing exhaustion: immediately rotates. - On auth failures: attempts token refresh before rotating. - - `classified_reason` lets the recovery path honor the structured error - classifier instead of relying only on raw HTTP codes. 
This matters for - providers that surface billing/rate-limit/auth conditions under a - different status code, such as Anthropic returning HTTP 400 for - "out of extra usage". - """ - pool = self._credential_pool - if pool is None: - return False, has_retried_429 - - effective_reason = classified_reason - if effective_reason is None: - if status_code == 402: - effective_reason = FailoverReason.billing - elif status_code == 429: - effective_reason = FailoverReason.rate_limit - elif status_code in {401, 403}: - effective_reason = FailoverReason.auth - - if effective_reason == FailoverReason.billing: - rotate_status = status_code if status_code is not None else 402 - next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context) - if next_entry is not None: - logger.info( - "Credential %s (billing) — rotated to pool entry %s", - rotate_status, - getattr(next_entry, "id", "?"), - ) - self._swap_credential(next_entry) - return True, False - return False, has_retried_429 - - if effective_reason == FailoverReason.rate_limit: - usage_limit_reached = False - if error_context: - context_reason = str(error_context.get("reason") or "").lower() - context_message = str(error_context.get("message") or "").lower() - usage_limit_reached = ( - "usage_limit_reached" in context_reason - or "usage limit has been reached" in context_message - ) - if not has_retried_429 and not usage_limit_reached: - return False, True - rotate_status = status_code if status_code is not None else 429 - next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context) - if next_entry is not None: - logger.info( - "Credential %s (rate limit) — rotated to pool entry %s", - rotate_status, - getattr(next_entry, "id", "?"), - ) - self._swap_credential(next_entry) - return True, False - return False, True - - if effective_reason == FailoverReason.auth: - # Subscription/entitlement 403s look like auth failures on the - # wire but refresh 
cannot fix them — the OAuth token is - # already valid; the account simply lacks the entitlement - # (e.g. xAI OAuth without SuperGrok/X Premium for grok-4.3). - # Without this guard, ``try_refresh_current()`` keeps minting - # fresh tokens against the same unsubscribed account and the - # main agent loop spins re-issuing the same 403 until the - # user Ctrl+C's. Surface the error instead so the friendly - # entitlement hint from ``_summarize_api_error`` can land. - if self._is_entitlement_failure(error_context, status_code): - logger.info( - "Credential %s — entitlement-shaped 403 from %s; " - "skipping pool refresh (account lacks subscription, " - "not a transient auth failure).", - status_code if status_code is not None else "auth", - self.provider or "provider", - ) - return False, has_retried_429 - refreshed = pool.try_refresh_current() - if refreshed is not None: - logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}") - self._swap_credential(refreshed) - return True, has_retried_429 - # Refresh failed — rotate to next credential instead of giving up. - # The failed entry is already marked exhausted by try_refresh_current(). 
- rotate_status = status_code if status_code is not None else 401 - next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context) - if next_entry is not None: - logger.info( - "Credential %s (auth refresh failed) — rotated to pool entry %s", - rotate_status, - getattr(next_entry, "id", "?"), - ) - self._swap_credential(next_entry) - return True, False - - return False, has_retried_429 + """Forwarder — see ``agent.agent_runtime_helpers.recover_with_credential_pool``.""" + from agent.agent_runtime_helpers import recover_with_credential_pool + return recover_with_credential_pool(self, status_code=status_code, has_retried_429=has_retried_429, classified_reason=classified_reason, error_context=error_context) def _credential_pool_may_recover_rate_limit(self) -> bool: """Whether a rate-limit retry should wait for same-provider credentials.""" @@ -7774,156 +2876,9 @@ class AIAgent: ) def _interruptible_api_call(self, api_kwargs: dict): - """ - Run the API call in a background thread so the main conversation loop - can detect interrupts without waiting for the full HTTP round-trip. - - Each worker thread gets its own OpenAI client instance. Interrupts only - close that worker-local client, so retries and other requests never - inherit a closed transport. - - Includes a stale-call detector: if no response arrives within the - configured timeout, the connection is killed and an error raised so - the main retry loop can try again with backoff / credential rotation / - provider fallback. 
- """ - result = {"response": None, "error": None} - request_client_holder = {"client": None} - - def _call(): - try: - if self.api_mode == "codex_responses": - request_client_holder["client"] = self._create_request_openai_client( - reason="codex_stream_request", - api_kwargs=api_kwargs, - ) - result["response"] = self._run_codex_stream( - api_kwargs, - client=request_client_holder["client"], - on_first_delta=getattr(self, "_codex_on_first_delta", None), - ) - elif self.api_mode == "anthropic_messages": - result["response"] = self._anthropic_messages_create(api_kwargs) - elif self.api_mode == "bedrock_converse": - # Bedrock uses boto3 directly — no OpenAI client needed. - # normalize_converse_response produces an OpenAI-compatible - # SimpleNamespace so the rest of the agent loop can treat - # bedrock responses like chat_completions responses. - from agent.bedrock_adapter import ( - _get_bedrock_runtime_client, - invalidate_runtime_client, - is_stale_connection_error, - normalize_converse_response, - ) - region = api_kwargs.pop("__bedrock_region__", "us-east-1") - api_kwargs.pop("__bedrock_converse__", None) - client = _get_bedrock_runtime_client(region) - try: - raw_response = client.converse(**api_kwargs) - except Exception as _bedrock_exc: - # Evict the cached client on stale-connection failures - # so the outer retry loop builds a fresh client/pool. 
- if is_stale_connection_error(_bedrock_exc): - invalidate_runtime_client(region) - raise - result["response"] = normalize_converse_response(raw_response) - else: - request_client_holder["client"] = self._create_request_openai_client( - reason="chat_completion_request", - api_kwargs=api_kwargs, - ) - result["response"] = request_client_holder["client"].chat.completions.create(**api_kwargs) - except Exception as e: - result["error"] = e - finally: - request_client = request_client_holder.get("client") - if request_client is not None: - self._close_request_openai_client(request_client, reason="request_complete") - - # ── Stale-call timeout (mirrors streaming stale detector) ──────── - # Non-streaming calls return nothing until the full response is - # ready. Without this, a hung provider can block for the full - # httpx timeout (default 1800s) with zero feedback. The stale - # detector kills the connection early so the main retry loop can - # apply richer recovery (credential rotation, provider fallback). - _stale_timeout = self._compute_non_stream_stale_timeout( - api_kwargs.get("messages", []) - ) - - _call_start = time.time() - self._touch_activity("waiting for non-streaming API response") - - t = threading.Thread(target=_call, daemon=True) - t.start() - _poll_count = 0 - while t.is_alive(): - t.join(timeout=0.3) - _poll_count += 1 - - # Touch activity every ~30s so the gateway's inactivity - # monitor knows we're alive while waiting for the response. - if _poll_count % 100 == 0: # 100 × 0.3s = 30s - _elapsed = time.time() - _call_start - self._touch_activity( - f"waiting for non-streaming response ({int(_elapsed)}s elapsed)" - ) - - # Stale-call detector: kill the connection if no response - # arrives within the configured timeout. - _elapsed = time.time() - _call_start - if _elapsed > _stale_timeout: - _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 - logger.warning( - "Non-streaming API call stale for %.0fs (threshold %.0fs). 
" - "model=%s context=~%s tokens. Killing connection.", - _elapsed, _stale_timeout, - api_kwargs.get("model", "unknown"), f"{_est_ctx:,}", - ) - self._emit_status( - f"⚠️ No response from provider for {int(_elapsed)}s " - f"(non-streaming, model: {api_kwargs.get('model', 'unknown')}). " - f"Aborting call." - ) - try: - if self.api_mode == "anthropic_messages": - self._anthropic_client.close() - self._rebuild_anthropic_client() - else: - rc = request_client_holder.get("client") - if rc is not None: - self._close_request_openai_client(rc, reason="stale_call_kill") - except Exception: - pass - self._touch_activity( - f"stale non-streaming call killed after {int(_elapsed)}s" - ) - # Wait briefly for the thread to notice the closed connection. - t.join(timeout=2.0) - if result["error"] is None and result["response"] is None: - result["error"] = TimeoutError( - f"Non-streaming API call timed out after {int(_elapsed)}s " - f"with no response (threshold: {int(_stale_timeout)}s)" - ) - break - - if self._interrupt_requested: - # Force-close the in-flight worker-local HTTP connection to stop - # token generation without poisoning the shared client used to - # seed future retries. 
- try: - if self.api_mode == "anthropic_messages": - self._anthropic_client.close() - self._rebuild_anthropic_client() - else: - request_client = request_client_holder.get("client") - if request_client is not None: - self._close_request_openai_client(request_client, reason="interrupt_abort") - except Exception: - pass - raise InterruptedError("Agent interrupted during API call") - if result["error"] is not None: - raise result["error"] - return result["response"] + """Forwarder — see ``agent.chat_completion_helpers.interruptible_api_call``.""" + from agent.chat_completion_helpers import interruptible_api_call + return interruptible_api_call(self, api_kwargs) # ── Unified streaming API call ───────────────────────────────────────── @@ -8094,1314 +3049,28 @@ class AIAgent: def _interruptible_streaming_api_call( self, api_kwargs: dict, *, on_first_delta: callable = None ): - """Streaming variant of _interruptible_api_call for real-time token delivery. - - Handles all three api_modes: - - chat_completions: stream=True on OpenAI-compatible endpoints - - anthropic_messages: client.messages.stream() via Anthropic SDK - - codex_responses: delegates to _run_codex_stream (already streaming) - - Fires stream_delta_callback and _stream_callback for each text token. - Tool-call turns suppress the callback — only text-only final responses - stream to the consumer. Returns a SimpleNamespace that mimics the - non-streaming response shape so the rest of the agent loop is unchanged. - - Falls back to _interruptible_api_call on provider errors indicating - streaming is not supported. - """ - if self._interrupt_requested: - raise InterruptedError("Agent interrupted before streaming API call") - - if self.api_mode == "codex_responses": - # Codex streams internally via _run_codex_stream. The main dispatch - # in _interruptible_api_call already calls it; we just need to - # ensure on_first_delta reaches it. Store it on the instance - # temporarily so _run_codex_stream can pick it up. 
- self._codex_on_first_delta = on_first_delta - try: - return self._interruptible_api_call(api_kwargs) - finally: - self._codex_on_first_delta = None - - # Bedrock Converse uses boto3's converse_stream() with real-time delta - # callbacks — same UX as Anthropic and chat_completions streaming. - if self.api_mode == "bedrock_converse": - result = {"response": None, "error": None} - first_delta_fired = {"done": False} - deltas_were_sent = {"yes": False} - - def _fire_first(): - if not first_delta_fired["done"] and on_first_delta: - first_delta_fired["done"] = True - try: - on_first_delta() - except Exception: - pass - - def _bedrock_call(): - try: - from agent.bedrock_adapter import ( - _get_bedrock_runtime_client, - invalidate_runtime_client, - is_stale_connection_error, - stream_converse_with_callbacks, - ) - region = api_kwargs.pop("__bedrock_region__", "us-east-1") - api_kwargs.pop("__bedrock_converse__", None) - client = _get_bedrock_runtime_client(region) - try: - raw_response = client.converse_stream(**api_kwargs) - except Exception as _bedrock_exc: - # Evict the cached client on stale-connection failures - # so the outer retry loop builds a fresh client/pool. 
- if is_stale_connection_error(_bedrock_exc): - invalidate_runtime_client(region) - raise - - def _on_text(text): - _fire_first() - self._fire_stream_delta(text) - deltas_were_sent["yes"] = True - - def _on_tool(name): - _fire_first() - self._fire_tool_gen_started(name) - - def _on_reasoning(text): - _fire_first() - self._fire_reasoning_delta(text) - - result["response"] = stream_converse_with_callbacks( - raw_response, - on_text_delta=_on_text if self._has_stream_consumers() else None, - on_tool_start=_on_tool, - on_reasoning_delta=_on_reasoning if self.reasoning_callback or self.stream_delta_callback else None, - on_interrupt_check=lambda: self._interrupt_requested, - ) - except Exception as e: - result["error"] = e - - t = threading.Thread(target=_bedrock_call, daemon=True) - t.start() - while t.is_alive(): - t.join(timeout=0.3) - if self._interrupt_requested: - raise InterruptedError("Agent interrupted during Bedrock API call") - if result["error"] is not None: - raise result["error"] - return result["response"] - - result = {"response": None, "error": None, "partial_tool_names": []} - request_client_holder = {"client": None, "diag": None} - first_delta_fired = {"done": False} - deltas_were_sent = {"yes": False} # Track if any deltas were fired (for fallback) - # Wall-clock timestamp of the last real streaming chunk. The outer - # poll loop uses this to detect stale connections that keep receiving - # SSE keep-alive pings but no actual data. - last_chunk_time = {"t": time.time()} - - def _fire_first_delta(): - if not first_delta_fired["done"] and on_first_delta: - first_delta_fired["done"] = True - try: - on_first_delta() - except Exception: - pass - - def _call_chat_completions(): - """Stream a chat completions response.""" - import httpx as _httpx - # Per-provider / per-model request_timeout_seconds (from config.yaml) - # wins over the HERMES_API_TIMEOUT env default if the user set it. 
- _provider_timeout_cfg = get_provider_request_timeout(self.provider, self.model) - _base_timeout = ( - _provider_timeout_cfg - if _provider_timeout_cfg is not None - else float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) - ) - # Read timeout: config wins here too. Otherwise use - # HERMES_STREAM_READ_TIMEOUT (default 120s) for cloud providers. - if _provider_timeout_cfg is not None: - _stream_read_timeout = _provider_timeout_cfg - else: - _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) - # Local providers (Ollama, llama.cpp, vLLM) can take minutes for - # prefill on large contexts before producing the first token. - # Auto-increase the httpx read timeout unless the user explicitly - # overrode HERMES_STREAM_READ_TIMEOUT. - if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): - _stream_read_timeout = _base_timeout - logger.debug( - "Local provider detected (%s) — stream read timeout raised to %.0fs", - self.base_url, _stream_read_timeout, - ) - stream_kwargs = { - **api_kwargs, - "stream": True, - "stream_options": {"include_usage": True}, - "timeout": _httpx.Timeout( - connect=30.0, - read=_stream_read_timeout, - write=_base_timeout, - pool=30.0, - ), - } - request_client_holder["client"] = self._create_request_openai_client( - reason="chat_completion_stream_request", - api_kwargs=stream_kwargs, - ) - # Reset stale-stream timer so the detector measures from this - # attempt's start, not a previous attempt's last chunk. - last_chunk_time["t"] = time.time() - self._touch_activity("waiting for provider response (streaming)") - # Initialize per-attempt stream diagnostics so the retry block can - # reach for them after the stream dies. Lives on - # ``request_client_holder["diag"]`` for closure access. 
- _diag = self._stream_diag_init() - request_client_holder["diag"] = _diag - stream = request_client_holder["client"].chat.completions.create(**stream_kwargs) - - # Capture rate limit headers from the initial HTTP response. - # The OpenAI SDK Stream object exposes the underlying httpx - # response via .response before any chunks are consumed. - self._capture_rate_limits(getattr(stream, "response", None)) - # Snapshot diagnostic headers (cf-ray, x-openrouter-provider, etc.) - # so they survive even when the stream dies before any chunk - # arrives. Best-effort; never raises. - self._stream_diag_capture_response(_diag, getattr(stream, "response", None)) - - # Log OpenRouter response cache status when present. - self._check_openrouter_cache_status(getattr(stream, "response", None)) - - content_parts: list = [] - tool_calls_acc: dict = {} - tool_gen_notified: set = set() - # Ollama-compatible endpoints reuse index 0 for every tool call - # in a parallel batch, distinguishing them only by id. Track - # the last seen id per raw index so we can detect a new tool - # call starting at the same index and redirect it to a fresh slot. - _last_id_at_idx: dict = {} # raw_index -> last seen non-empty id - _active_slot_by_idx: dict = {} # raw_index -> current slot in tool_calls_acc - finish_reason = None - model_name = None - role = "assistant" - reasoning_parts: list = [] - usage_obj = None - for chunk in stream: - last_chunk_time["t"] = time.time() - self._touch_activity("receiving stream response") - - # Update per-attempt diagnostic counters. Best-effort — - # failures are swallowed so the streaming hot path is never - # interrupted by diagnostic accounting. 
- try: - _diag["chunks"] = int(_diag.get("chunks", 0)) + 1 - if _diag.get("first_chunk_at") is None: - _diag["first_chunk_at"] = last_chunk_time["t"] - # Approximate byte size from the chunk's repr — exact wire - # bytes aren't exposed by the SDK, but len(repr(chunk)) is - # a stable proxy for "how much content arrived" that - # survives stub provider differences. - try: - _diag["bytes"] = int(_diag.get("bytes", 0)) + len(repr(chunk)) - except Exception: - pass - except Exception: - pass - - if self._interrupt_requested: - break - - if not chunk.choices: - if hasattr(chunk, "model") and chunk.model: - model_name = chunk.model - # Usage comes in the final chunk with empty choices - if hasattr(chunk, "usage") and chunk.usage: - usage_obj = chunk.usage - continue - - delta = chunk.choices[0].delta - if hasattr(chunk, "model") and chunk.model: - model_name = chunk.model - - # Accumulate reasoning content - reasoning_text = getattr(delta, "reasoning_content", None) or getattr(delta, "reasoning", None) - if reasoning_text: - reasoning_parts.append(reasoning_text) - _fire_first_delta() - self._fire_reasoning_delta(reasoning_text) - - # Accumulate text content — fire callback only when no tool calls - if delta and delta.content: - content_parts.append(delta.content) - if not tool_calls_acc: - _fire_first_delta() - self._fire_stream_delta(delta.content) - deltas_were_sent["yes"] = True - # Tool calls suppress regular content streaming (avoids - # displaying chatty "I'll use the tool..." text alongside - # tool calls). But reasoning tags embedded in suppressed - # content should still reach the display — otherwise the - # reasoning box only appears as a post-response fallback, - # rendering it confusingly after the already-streamed - # response. Route suppressed content through the stream - # delta callback so its tag extraction can fire the - # reasoning display. 
Non-reasoning text is harmlessly - # suppressed by the CLI's _stream_delta when the stream - # box is already closed (tool boundary flush). - elif self.stream_delta_callback: - try: - self.stream_delta_callback(delta.content) - self._record_streamed_assistant_text(delta.content) - except Exception: - pass - - # Accumulate tool call deltas — notify display on first name - if delta and delta.tool_calls: - for tc_delta in delta.tool_calls: - raw_idx = tc_delta.index if tc_delta.index is not None else 0 - delta_id = tc_delta.id or "" - - # Ollama fix: detect a new tool call reusing the same - # raw index (different id) and redirect to a fresh slot. - if raw_idx not in _active_slot_by_idx: - _active_slot_by_idx[raw_idx] = raw_idx - if ( - delta_id - and raw_idx in _last_id_at_idx - and delta_id != _last_id_at_idx[raw_idx] - ): - new_slot = max(tool_calls_acc, default=-1) + 1 - _active_slot_by_idx[raw_idx] = new_slot - if delta_id: - _last_id_at_idx[raw_idx] = delta_id - idx = _active_slot_by_idx[raw_idx] - - if idx not in tool_calls_acc: - tool_calls_acc[idx] = { - "id": tc_delta.id or "", - "type": "function", - "function": {"name": "", "arguments": ""}, - "extra_content": None, - } - entry = tool_calls_acc[idx] - if tc_delta.id: - entry["id"] = tc_delta.id - if tc_delta.function: - if tc_delta.function.name: - # Use assignment, not +=. Function names are - # atomic identifiers delivered complete in the - # first chunk (OpenAI spec). Some providers - # (MiniMax M2.7 via NVIDIA NIM) resend the full - # name in every chunk; concatenation would - # produce "read_fileread_file". Assignment - # (matching the OpenAI Node SDK / LiteLLM / - # Vercel AI patterns) is immune to this. 
- entry["function"]["name"] = tc_delta.function.name - if tc_delta.function.arguments: - entry["function"]["arguments"] += tc_delta.function.arguments - extra = getattr(tc_delta, "extra_content", None) - if extra is None and hasattr(tc_delta, "model_extra"): - extra = (tc_delta.model_extra or {}).get("extra_content") - if extra is not None: - if hasattr(extra, "model_dump"): - extra = extra.model_dump() - entry["extra_content"] = extra - # Fire once per tool when the full name is available - name = entry["function"]["name"] - if name and idx not in tool_gen_notified: - tool_gen_notified.add(idx) - _fire_first_delta() - self._fire_tool_gen_started(name) - # Record the partial tool-call name so the outer - # stub-builder can surface a user-visible warning - # if streaming dies before this tool's arguments - # are fully delivered. Without this, a stall - # during tool-call JSON generation lets the stub - # at line ~6107 return `tool_calls=None`, silently - # discarding the attempted action. - result["partial_tool_names"].append(name) - - if chunk.choices[0].finish_reason: - finish_reason = chunk.choices[0].finish_reason - - # Usage in the final chunk - if hasattr(chunk, "usage") and chunk.usage: - usage_obj = chunk.usage - - # Build mock response matching non-streaming shape - full_content = "".join(content_parts) or None - mock_tool_calls = None - has_truncated_tool_args = False - if tool_calls_acc: - mock_tool_calls = [] - for idx in sorted(tool_calls_acc): - tc = tool_calls_acc[idx] - arguments = tc["function"]["arguments"] - tool_name = tc["function"]["name"] or "?" - if arguments and arguments.strip(): - try: - json.loads(arguments) - except json.JSONDecodeError: - # Attempt repair before flagging as truncated. - # Models like GLM-5.1 via Ollama produce trailing - # commas, unclosed brackets, Python None, etc. - # Without repair, these hit the truncation handler - # and kill the session. 
_repair_tool_call_arguments - # returns "{}" for unrepairable args, which is far - # better than a crashed session. - repaired = _repair_tool_call_arguments(arguments, tool_name) - if repaired != "{}": - # Successfully repaired — use the fixed args - arguments = repaired - else: - # Unrepairable — flag for truncation handling - has_truncated_tool_args = True - mock_tool_calls.append(SimpleNamespace( - id=tc["id"], - type=tc["type"], - extra_content=tc.get("extra_content"), - function=SimpleNamespace( - name=tc["function"]["name"], - arguments=arguments, - ), - )) - - effective_finish_reason = finish_reason or "stop" - if has_truncated_tool_args: - effective_finish_reason = "length" - - full_reasoning = "".join(reasoning_parts) or None - mock_message = SimpleNamespace( - role=role, - content=full_content, - tool_calls=mock_tool_calls, - reasoning_content=full_reasoning, - ) - mock_choice = SimpleNamespace( - index=0, - message=mock_message, - finish_reason=effective_finish_reason, - ) - return SimpleNamespace( - id="stream-" + str(uuid.uuid4()), - model=model_name, - choices=[mock_choice], - usage=usage_obj, - ) - - def _call_anthropic(): - """Stream an Anthropic Messages API response. - - Fires delta callbacks for real-time token delivery, but returns - the native Anthropic Message object from get_final_message() so - the rest of the agent loop (validation, tool extraction, etc.) - works unchanged. - """ - has_tool_use = False - - # Reset stale-stream timer for this attempt - last_chunk_time["t"] = time.time() - # Per-attempt diagnostic dict for the retry block to consume. - _diag = self._stream_diag_init() - request_client_holder["diag"] = _diag - # Use the Anthropic SDK's streaming context manager - with self._anthropic_client.messages.stream(**api_kwargs) as stream: - # The Anthropic SDK exposes the raw httpx response on - # ``stream.response``. Snapshot diagnostic headers - # immediately so they survive a stream that dies before the - # first event. 
- try: - self._stream_diag_capture_response( - _diag, getattr(stream, "response", None) - ) - except Exception: - pass - for event in stream: - # Update stale-stream timer on every event so the - # outer poll loop knows data is flowing. Without - # this, the detector kills healthy long-running - # Opus streams after 180 s even when events are - # actively arriving (the chat_completions path - # already does this at the top of its chunk loop). - last_chunk_time["t"] = time.time() - self._touch_activity("receiving stream response") - - # Update per-attempt diagnostic counters (best-effort). - try: - _diag["chunks"] = int(_diag.get("chunks", 0)) + 1 - if _diag.get("first_chunk_at") is None: - _diag["first_chunk_at"] = last_chunk_time["t"] - try: - _diag["bytes"] = int(_diag.get("bytes", 0)) + len(repr(event)) - except Exception: - pass - except Exception: - pass - - if self._interrupt_requested: - break - - event_type = getattr(event, "type", None) - - if event_type == "content_block_start": - block = getattr(event, "content_block", None) - if block and getattr(block, "type", None) == "tool_use": - has_tool_use = True - tool_name = getattr(block, "name", None) - if tool_name: - _fire_first_delta() - self._fire_tool_gen_started(tool_name) - - elif event_type == "content_block_delta": - delta = getattr(event, "delta", None) - if delta: - delta_type = getattr(delta, "type", None) - if delta_type == "text_delta": - text = getattr(delta, "text", "") - if text and not has_tool_use: - _fire_first_delta() - self._fire_stream_delta(text) - deltas_were_sent["yes"] = True - elif delta_type == "thinking_delta": - thinking_text = getattr(delta, "thinking", "") - if thinking_text: - _fire_first_delta() - self._fire_reasoning_delta(thinking_text) - - # Return the native Anthropic Message for downstream processing - return stream.get_final_message() - - def _call(): - import httpx as _httpx - - _max_stream_retries = int(os.getenv("HERMES_STREAM_RETRIES", 2)) - - try: - for 
_stream_attempt in range(_max_stream_retries + 1): - # Check for interrupt before each retry attempt. Without - # this, /stop closes the HTTP connection (outer poll loop), - # but the retry loop opens a FRESH connection — negating the - # interrupt entirely. On slow providers (ollama-cloud) each - # retry can block for the full stream-read timeout (120s+), - # causing multi-minute delays between /stop and response. - if self._interrupt_requested: - raise InterruptedError("Agent interrupted before stream retry") - try: - if self.api_mode == "anthropic_messages": - self._try_refresh_anthropic_client_credentials() - result["response"] = _call_anthropic() - else: - result["response"] = _call_chat_completions() - return # success - except Exception as e: - _is_timeout = isinstance( - e, (_httpx.ReadTimeout, _httpx.ConnectTimeout, _httpx.PoolTimeout) - ) - _is_conn_err = isinstance( - e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError) - ) - _is_stream_parse_err = self._is_provider_stream_parse_error(e) - - # If the stream died AFTER some tokens were delivered: - # normally we don't retry (the user already saw text, - # retrying would duplicate it). BUT: if a tool call - # was in-flight when the stream died, silently aborting - # discards the tool call entirely. In that case we - # prefer to retry — the user sees a brief - # "reconnecting" marker + duplicated preamble text, - # which is strictly better than a failed action with - # a "retry manually" message. Limit this to transient - # connection errors (Clawdbot-style narrow gate): no - # tool has executed yet within this API call, so - # silent retry is safe wrt side-effects. 
- if deltas_were_sent["yes"]: - _partial_tool_in_flight = bool( - result.get("partial_tool_names") - ) - _is_sse_conn_err_preview = False - if not _is_timeout and not _is_conn_err: - from openai import APIError as _APIError - if isinstance(e, _APIError) and not getattr(e, "status_code", None): - _err_lower_preview = str(e).lower() - _SSE_PREVIEW_PHRASES = ( - "connection lost", - "connection reset", - "connection closed", - "connection terminated", - "network error", - "network connection", - "terminated", - "peer closed", - "broken pipe", - "upstream connect error", - ) - _is_sse_conn_err_preview = any( - phrase in _err_lower_preview - for phrase in _SSE_PREVIEW_PHRASES - ) - _is_transient = ( - _is_timeout - or _is_conn_err - or _is_sse_conn_err_preview - or _is_stream_parse_err - ) - _can_silent_retry = ( - _partial_tool_in_flight - and _is_transient - and _stream_attempt < _max_stream_retries - ) - if not _can_silent_retry: - # Either no tool call was in-flight (so the - # turn was a pure text response — current - # stub-with-recovered-text behaviour is - # correct), or retries are exhausted, or the - # error isn't transient. Fall through to the - # stub path. - logger.warning( - "Streaming failed after partial delivery, not retrying: %s", e - ) - result["error"] = e - return - # Tool call was in-flight AND error is transient: - # retry silently. Clear per-attempt state so the - # next stream starts clean. Fire a "reconnecting" - # marker so the user sees why the preamble is - # about to be re-streamed. Structured WARNING is - # emitted by ``_emit_stream_drop`` below; no - # additional INFO line needed. - try: - self._fire_stream_delta( - "\n\n⚠ Connection dropped mid tool-call; " - "reconnecting…\n\n" - ) - except Exception: - pass - # Reset the streamed-text buffer so the retry's - # fresh preamble doesn't get double-recorded in - # _current_streamed_assistant_text (which would - # pollute the interim-visible-text comparison). 
- try: - self._reset_stream_delivery_tracking() - except Exception: - pass - # Reset in-memory accumulators so the next - # attempt's chunks don't concat onto the dead - # stream's partial JSON. - result["partial_tool_names"] = [] - deltas_were_sent["yes"] = False - first_delta_fired["done"] = False - self._emit_stream_drop( - error=e, - attempt=_stream_attempt + 2, - max_attempts=_max_stream_retries + 1, - mid_tool_call=True, - diag=request_client_holder.get("diag"), - ) - stale = request_client_holder.get("client") - if stale is not None: - self._close_request_openai_client( - stale, reason="stream_mid_tool_retry_cleanup" - ) - request_client_holder["client"] = None - try: - self._replace_primary_openai_client( - reason="stream_mid_tool_retry_pool_cleanup" - ) - except Exception: - pass - continue - - # SSE error events from proxies (e.g. OpenRouter sends - # {"error":{"message":"Network connection lost."}}) are - # raised as APIError by the OpenAI SDK. These are - # semantically identical to httpx connection drops — - # the upstream stream died — and should be retried with - # a fresh connection. Distinguish from HTTP errors: - # APIError from SSE has no status_code, while - # APIStatusError (4xx/5xx) always has one. - _is_sse_conn_err = False - if not _is_timeout and not _is_conn_err: - from openai import APIError as _APIError - if isinstance(e, _APIError) and not getattr(e, "status_code", None): - _err_lower_sse = str(e).lower() - _SSE_CONN_PHRASES = ( - "connection lost", - "connection reset", - "connection closed", - "connection terminated", - "network error", - "network connection", - "terminated", - "peer closed", - "broken pipe", - "upstream connect error", - ) - _is_sse_conn_err = any( - phrase in _err_lower_sse - for phrase in _SSE_CONN_PHRASES - ) - - if _is_timeout or _is_conn_err or _is_sse_conn_err or _is_stream_parse_err: - # Transient network / timeout error. Retry the - # streaming request with a fresh connection first. 
- if _stream_attempt < _max_stream_retries: - self._emit_stream_drop( - error=e, - attempt=_stream_attempt + 2, - max_attempts=_max_stream_retries + 1, - mid_tool_call=False, - diag=request_client_holder.get("diag"), - ) - # Close the stale request client before retry - stale = request_client_holder.get("client") - if stale is not None: - self._close_request_openai_client( - stale, reason="stream_retry_cleanup" - ) - request_client_holder["client"] = None - # Also rebuild the primary client to purge - # any dead connections from the pool. - try: - self._replace_primary_openai_client( - reason="stream_retry_pool_cleanup" - ) - except Exception: - pass - continue - # Retries exhausted. Log the final failure with - # full diagnostic detail (chain, headers, - # bytes/elapsed) via the same helper used for - # mid-flight retries — subagent lines get the - # ``[subagent-N]`` log_prefix so the parent can - # attribute them. - self._log_stream_retry( - kind="exhausted", - error=e, - attempt=_max_stream_retries + 1, - max_attempts=_max_stream_retries + 1, - mid_tool_call=False, - diag=request_client_holder.get("diag"), - ) - if _is_stream_parse_err: - self._emit_status( - "❌ Provider returned malformed streaming data after " - f"{_max_stream_retries + 1} attempts. " - "The provider may be experiencing issues — " - "try again in a moment." - ) - else: - self._emit_status( - "❌ Connection to provider failed after " - f"{_max_stream_retries + 1} attempts. " - "The provider may be experiencing issues — " - "try again in a moment." - ) - else: - _err_lower = str(e).lower() - _is_stream_unsupported = ( - "stream" in _err_lower - and "not supported" in _err_lower - ) - if _is_stream_unsupported: - self._disable_streaming = True - self._safe_print( - "\n⚠ Streaming is not supported for this " - "model/provider. 
Switching to non-streaming.\n" - " To avoid this delay, set display.streaming: false " - "in config.yaml\n" - ) - logger.info( - "Streaming failed before delivery: %s", - e, - ) - - # Propagate the error to the main retry loop instead of - # falling back to non-streaming inline. The main loop has - # richer recovery: credential rotation, provider fallback, - # backoff, and — for "stream not supported" — will switch - # to non-streaming on the next attempt via _disable_streaming. - result["error"] = e - return - except InterruptedError as e: - # The interrupt may be noticed inside the worker thread before - # the polling loop sees it. Surface it through the normal result - # channel so callers never miss a fast pre-retry interrupt. - result["error"] = e - return - finally: - request_client = request_client_holder.get("client") - if request_client is not None: - self._close_request_openai_client(request_client, reason="stream_request_complete") - - _stream_stale_timeout_base = float(os.getenv("HERMES_STREAM_STALE_TIMEOUT", 180.0)) - # Local providers (Ollama, oMLX, llama-cpp) can take 300+ seconds - # for prefill on large contexts. Disable the stale detector unless - # the user explicitly set HERMES_STREAM_STALE_TIMEOUT. - if _stream_stale_timeout_base == 180.0 and self.base_url and is_local_endpoint(self.base_url): - _stream_stale_timeout = float("inf") - logger.debug("Local provider detected (%s) — stale stream timeout disabled", self.base_url) - else: - # Scale the stale timeout for large contexts: slow models (like Opus) - # can legitimately think for minutes before producing the first token - # when the context is large. Without this, the stale detector kills - # healthy connections during the model's thinking phase, producing - # spurious RemoteProtocolError ("peer closed connection"). 
- _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 - if _est_tokens > 100_000: - _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0) - elif _est_tokens > 50_000: - _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0) - else: - _stream_stale_timeout = _stream_stale_timeout_base - - t = threading.Thread(target=_call, daemon=True) - t.start() - _last_heartbeat = time.time() - _HEARTBEAT_INTERVAL = 30.0 # seconds between gateway activity touches - while t.is_alive(): - t.join(timeout=0.3) - - # Periodic heartbeat: touch the agent's activity tracker so the - # gateway's inactivity monitor knows we're alive while waiting - # for stream chunks. Without this, long thinking pauses (e.g. - # reasoning models) or slow prefill on local providers (Ollama) - # trigger false inactivity timeouts. The _call thread touches - # activity on each chunk, but the gap between API call start - # and first chunk can exceed the gateway timeout — especially - # when the stale-stream timeout is disabled (local providers). - _hb_now = time.time() - if _hb_now - _last_heartbeat >= _HEARTBEAT_INTERVAL: - _last_heartbeat = _hb_now - _waiting_secs = int(_hb_now - last_chunk_time["t"]) - self._touch_activity( - f"waiting for stream response ({_waiting_secs}s, no chunks yet)" - ) - - # Detect stale streams: connections kept alive by SSE pings - # but delivering no real chunks. Kill the client so the - # inner retry loop can start a fresh connection. - _stale_elapsed = time.time() - last_chunk_time["t"] - if _stale_elapsed > _stream_stale_timeout: - _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 - logger.warning( - "Stream stale for %.0fs (threshold %.0fs) — no chunks received. " - "model=%s context=~%s tokens. 
Killing connection.", - _stale_elapsed, _stream_stale_timeout, - api_kwargs.get("model", "unknown"), f"{_est_ctx:,}", - ) - self._emit_status( - f"⚠️ No response from provider for {int(_stale_elapsed)}s " - f"(model: {api_kwargs.get('model', 'unknown')}, " - f"context: ~{_est_ctx:,} tokens). " - f"Reconnecting..." - ) - try: - rc = request_client_holder.get("client") - if rc is not None: - self._close_request_openai_client(rc, reason="stale_stream_kill") - except Exception: - pass - # Rebuild the primary client too — its connection pool - # may hold dead sockets from the same provider outage. - try: - self._replace_primary_openai_client(reason="stale_stream_pool_cleanup") - except Exception: - pass - # Reset the timer so we don't kill repeatedly while - # the inner thread processes the closure. - last_chunk_time["t"] = time.time() - self._touch_activity( - f"stale stream detected after {int(_stale_elapsed)}s, reconnecting" - ) - - if self._interrupt_requested: - try: - if self.api_mode == "anthropic_messages": - self._anthropic_client.close() - self._rebuild_anthropic_client() - else: - request_client = request_client_holder.get("client") - if request_client is not None: - self._close_request_openai_client(request_client, reason="stream_interrupt_abort") - except Exception: - pass - raise InterruptedError("Agent interrupted during streaming API call") - if result["error"] is not None: - if deltas_were_sent["yes"]: - # Streaming failed AFTER some tokens were already delivered to - # the platform. Re-raising would let the outer retry loop make - # a new API call, creating a duplicate message. Return a - # partial "stop" response instead so the outer loop treats this - # turn as complete (no retry, no fallback). - # Recover whatever content was already streamed to the user. - # _current_streamed_assistant_text accumulates text fired - # through _fire_stream_delta, so it has exactly what the - # user saw before the connection died. 
- _partial_text = ( - getattr(self, "_current_streamed_assistant_text", "") or "" - ).strip() or None - - # If the stream died while the model was emitting a tool call, - # the stub below will silently set `tool_calls=None` and the - # agent loop will treat the turn as complete — the attempted - # action is lost with no user-facing signal. Append a - # human-visible warning to the stub content so (a) the user - # knows something failed, and (b) the next turn's model sees - # in conversation history what was attempted and can retry. - _partial_names = list(result.get("partial_tool_names") or []) - if _partial_names: - _name_str = ", ".join(_partial_names[:3]) - if len(_partial_names) > 3: - _name_str += f", +{len(_partial_names) - 3} more" - _warn = ( - f"\n\n⚠ Stream stalled mid tool-call " - f"({_name_str}); the action was not executed. " - f"Ask me to retry if you want to continue." - ) - _partial_text = (_partial_text or "") + _warn - # Also fire as a streaming delta so the user sees it now - # instead of only in the persisted transcript. 
- try: - self._fire_stream_delta(_warn) - except Exception: - pass - logger.warning( - "Partial stream dropped tool call(s) %s after %s chars " - "of text; surfaced warning to user: %s", - _partial_names, len(_partial_text or ""), result["error"], - ) - else: - logger.warning( - "Partial stream delivered before error; returning stub " - "response with %s chars of recovered content to prevent " - "duplicate messages: %s", - len(_partial_text or ""), - result["error"], - ) - _stub_msg = SimpleNamespace( - role="assistant", content=_partial_text, tool_calls=None, - reasoning_content=None, - ) - return SimpleNamespace( - id="partial-stream-stub", - model=getattr(self, "model", "unknown"), - choices=[SimpleNamespace( - index=0, message=_stub_msg, finish_reason="stop", - )], - usage=None, - ) - raise result["error"] - return result["response"] - - # ── Provider fallback ────────────────────────────────────────────────── + """Forwarder — see ``agent.chat_completion_helpers.interruptible_streaming_api_call``.""" + from agent.chat_completion_helpers import interruptible_streaming_api_call + return interruptible_streaming_api_call(self, api_kwargs, on_first_delta=on_first_delta) def _try_activate_fallback(self, reason: "FailoverReason | None" = None) -> bool: - """Switch to the next fallback model/provider in the chain. - - Called when the current model is failing after retries. Swaps the - OpenAI client, model slug, and provider in-place so the retry loop - can continue with the new backend. Advances through the chain on - each call; returns False when exhausted. - - Uses the centralized provider router (resolve_provider_client) for - auth resolution and client construction — no duplicated provider→key - mappings. - """ - if reason in {FailoverReason.rate_limit, FailoverReason.billing}: - # Only start cooldown when leaving the primary provider. 
If we're - # already on a fallback and chain-switching, the primary wasn't the - # source of the 429 so the cooldown should not be reset/extended. - fallback_already_active = bool(getattr(self, "_fallback_activated", False)) - current_provider = (getattr(self, "provider", "") or "").strip().lower() - primary_provider = ((self._primary_runtime or {}).get("provider") or "").strip().lower() - if (not fallback_already_active) or (primary_provider and current_provider == primary_provider): - self._rate_limited_until = time.monotonic() + 60 - if self._fallback_index >= len(self._fallback_chain): - return False - - fb = self._fallback_chain[self._fallback_index] - self._fallback_index += 1 - fb_provider = (fb.get("provider") or "").strip().lower() - fb_model = (fb.get("model") or "").strip() - if not fb_provider or not fb_model: - return self._try_activate_fallback() # skip invalid, try next - - # Skip entries that resolve to the current (provider, model) — falling - # back to the same backend that just failed loops the failure. Compare - # base_url too so two distinct custom_providers entries pointing at the - # same shim/proxy URL also dedup. See issue #22548. 
- current_provider = (getattr(self, "provider", "") or "").strip().lower() - current_model = (getattr(self, "model", "") or "").strip() - current_base_url = str(getattr(self, "base_url", "") or "").rstrip("/").lower() - fb_base_url_for_dedup = (fb.get("base_url") or "").strip().rstrip("/").lower() - if fb_provider == current_provider and fb_model == current_model: - logging.warning( - "Fallback skip: chain entry %s/%s matches current provider/model", - fb_provider, fb_model, - ) - return self._try_activate_fallback() - if ( - fb_base_url_for_dedup - and current_base_url - and fb_base_url_for_dedup == current_base_url - and fb_model == current_model - ): - logging.warning( - "Fallback skip: chain entry base_url %s matches current backend", - fb_base_url_for_dedup, - ) - return self._try_activate_fallback() - - # Use centralized router for client construction. - # raw_codex=True because the main agent needs direct responses.stream() - # access for Codex providers. - try: - from agent.auxiliary_client import resolve_provider_client - # Pass base_url and api_key from fallback config so custom - # endpoints (e.g. Ollama Cloud) resolve correctly instead of - # falling through to OpenRouter defaults. - fb_base_url_hint = (fb.get("base_url") or "").strip() or None - fb_api_key_hint = (fb.get("api_key") or "").strip() or None - if not fb_api_key_hint: - # key_env and api_key_env are both documented aliases (see - # _normalize_custom_provider_entry in hermes_cli/config.py). - fb_key_env = (fb.get("key_env") or fb.get("api_key_env") or "").strip() - if fb_key_env: - fb_api_key_hint = os.getenv(fb_key_env, "").strip() or None - # For Ollama Cloud endpoints, pull OLLAMA_API_KEY from env - # when no explicit key is in the fallback config. Host match - # (not substring) — see GHSA-76xc-57q6-vm5m. 
- if fb_base_url_hint and base_url_host_matches(fb_base_url_hint, "ollama.com") and not fb_api_key_hint: - fb_api_key_hint = os.getenv("OLLAMA_API_KEY") or None - fb_client, _resolved_fb_model = resolve_provider_client( - fb_provider, model=fb_model, raw_codex=True, - explicit_base_url=fb_base_url_hint, - explicit_api_key=fb_api_key_hint) - if fb_client is None: - logging.warning( - "Fallback to %s failed: provider not configured", - fb_provider) - return self._try_activate_fallback() # try next in chain - try: - from hermes_cli.model_normalize import normalize_model_for_provider - - fb_model = normalize_model_for_provider(fb_model, fb_provider) - except Exception: - pass - - # Determine api_mode from provider / base URL / model - fb_api_mode = "chat_completions" - fb_base_url = str(fb_client.base_url) - _fb_is_azure = self._is_azure_openai_url(fb_base_url) - if fb_provider == "openai-codex": - fb_api_mode = "codex_responses" - elif fb_provider == "anthropic" or fb_base_url.rstrip("/").lower().endswith("/anthropic"): - fb_api_mode = "anthropic_messages" - elif _fb_is_azure: - # Azure OpenAI serves gpt-5.x on /chat/completions — does NOT - # support the Responses API. Stay on chat_completions. - fb_api_mode = "chat_completions" - elif self._is_direct_openai_url(fb_base_url): - fb_api_mode = "codex_responses" - elif self._provider_model_requires_responses_api( - fb_model, - provider=fb_provider, - ): - # GPT-5.x models usually need Responses API, but keep - # provider-specific exceptions like Copilot gpt-5-mini on - # chat completions. - fb_api_mode = "codex_responses" - elif fb_provider == "bedrock" or ( - base_url_hostname(fb_base_url).startswith("bedrock-runtime.") - and base_url_host_matches(fb_base_url, "amazonaws.com") - ): - fb_api_mode = "bedrock_converse" - - old_model = self.model - - # Clear the per-config context_length override so the fallback - # model's actual context window is resolved instead of inheriting - # the stale value from the previous model. 
See #22387. - self._config_context_length = None - self.model = fb_model - self.provider = fb_provider - self.base_url = fb_base_url - self.api_mode = fb_api_mode - if hasattr(self, "_transport_cache"): - self._transport_cache.clear() - self._fallback_activated = True - - # Honor per-provider / per-model request_timeout_seconds for the - # fallback target (same knob the primary client uses). None = use - # SDK default. - _fb_timeout = get_provider_request_timeout(fb_provider, fb_model) - - if fb_api_mode == "anthropic_messages": - # Build native Anthropic client instead of using OpenAI client - from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token, _is_oauth_token - effective_key = (fb_client.api_key or resolve_anthropic_token() or "") if fb_provider == "anthropic" else (fb_client.api_key or "") - self.api_key = effective_key - self._anthropic_api_key = effective_key - self._anthropic_base_url = fb_base_url - self._anthropic_client = build_anthropic_client( - effective_key, self._anthropic_base_url, timeout=_fb_timeout, - ) - self._is_anthropic_oauth = _is_oauth_token(effective_key) if fb_provider == "anthropic" else False - self.client = None - self._client_kwargs = {} - else: - # Swap OpenAI client and config in-place - self.api_key = fb_client.api_key - self.client = fb_client - # Preserve provider-specific headers that - # resolve_provider_client() may have baked into - # fb_client via the default_headers kwarg. The OpenAI - # SDK stores these in _custom_headers. Without this, - # subsequent request-client rebuilds (via - # _create_request_openai_client) drop the headers, - # causing 403s from providers like Kimi Coding that - # require a User-Agent sentinel. 
- fb_headers = getattr(fb_client, "_custom_headers", None) - if not fb_headers: - fb_headers = getattr(fb_client, "default_headers", None) - self._client_kwargs = { - "api_key": fb_client.api_key, - "base_url": fb_base_url, - **({"default_headers": dict(fb_headers)} if fb_headers else {}), - } - if _fb_timeout is not None: - self._client_kwargs["timeout"] = _fb_timeout - # Rebuild the shared OpenAI client so the configured - # timeout takes effect on the very next fallback request, - # not only after a later credential-rotation rebuild. - self._replace_primary_openai_client(reason="fallback_timeout_apply") - - # Re-evaluate prompt caching for the new provider/model - self._use_prompt_caching, self._use_native_cache_layout = ( - self._anthropic_prompt_cache_policy( - provider=fb_provider, - base_url=fb_base_url, - api_mode=fb_api_mode, - model=fb_model, - ) - ) - - # LM Studio: preload before probing the fallback's context length. - self._ensure_lmstudio_runtime_loaded() - - # Update context compressor limits for the fallback model. - # Without this, compression decisions use the primary model's - # context window (e.g. 200K) instead of the fallback's (e.g. 32K), - # causing oversized sessions to overflow the fallback. - # Also pass _config_context_length so the explicit config override - # (model.context_length in config.yaml) is respected — without this, - # the fallback activation drops to 128K even when config says 204800. 
- if hasattr(self, 'context_compressor') and self.context_compressor: - from agent.model_metadata import get_model_context_length - fb_context_length = get_model_context_length( - self.model, base_url=self.base_url, - api_key=self.api_key, provider=self.provider, - config_context_length=getattr(self, "_config_context_length", None), - custom_providers=self._custom_providers, - ) - self.context_compressor.update_model( - model=self.model, - context_length=fb_context_length, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - - self._emit_status( - f"🔄 Primary model failed — switching to fallback: " - f"{fb_model} via {fb_provider}" - ) - logging.info( - "Fallback activated: %s → %s (%s)", - old_model, fb_model, fb_provider, - ) - return True - except Exception as e: - logging.error("Failed to activate fallback %s: %s", fb_model, e) - return self._try_activate_fallback() # try next in chain + """Forwarder — see ``agent.chat_completion_helpers.try_activate_fallback``.""" + from agent.chat_completion_helpers import try_activate_fallback + return try_activate_fallback(self, reason) # ── Per-turn primary restoration ───────────────────────────────────── def _restore_primary_runtime(self) -> bool: - """Restore the primary runtime at the start of a new turn. - - In long-lived CLI sessions a single AIAgent instance spans multiple - turns. Without restoration, one transient failure pins the session - to the fallback provider for every subsequent turn. Calling this at - the top of ``run_conversation()`` makes fallback turn-scoped. - - The gateway caches agents across messages (``_agent_cache`` in - ``gateway/run.py``), so this restoration IS needed there too. - """ - if not self._fallback_activated: - # Reset the chain index even when no fallback was activated this - # turn. 
Without this, a turn where _try_activate_fallback() was - # called but returned False (chain exhausted or provider not - # configured) leaves _fallback_index >= len(_fallback_chain) while - # _fallback_activated stays False. The next turn skips this block - # entirely, stranding the index and silently blocking all future - # fallback attempts for the session. Fixes #20465. - self._fallback_index = 0 - return False - - if getattr(self, "_rate_limited_until", 0) > time.monotonic(): - return False # primary still in rate-limit cooldown, stay on fallback - - rt = self._primary_runtime - try: - # ── Core runtime state ── - self.model = rt["model"] - self.provider = rt["provider"] - self.base_url = rt["base_url"] # setter updates _base_url_lower - self.api_mode = rt["api_mode"] - if hasattr(self, "_transport_cache"): - self._transport_cache.clear() - self.api_key = rt["api_key"] - self._client_kwargs = dict(rt["client_kwargs"]) - self._use_prompt_caching = rt["use_prompt_caching"] - # Default to native layout when the restored snapshot predates the - # native-vs-proxy split (older sessions saved before this PR). 
- self._use_native_cache_layout = rt.get( - "use_native_cache_layout", - self.api_mode == "anthropic_messages" and self.provider == "anthropic", - ) - - # ── Rebuild client for the primary provider ── - if self.api_mode == "anthropic_messages": - from agent.anthropic_adapter import build_anthropic_client - self._anthropic_api_key = rt["anthropic_api_key"] - self._anthropic_base_url = rt["anthropic_base_url"] - self._anthropic_client = build_anthropic_client( - rt["anthropic_api_key"], rt["anthropic_base_url"], - timeout=get_provider_request_timeout(self.provider, self.model), - ) - self._is_anthropic_oauth = rt["is_anthropic_oauth"] - self.client = None - else: - self.client = self._create_openai_client( - dict(rt["client_kwargs"]), - reason="restore_primary", - shared=True, - ) - - # ── Restore context engine state ── - cc = self.context_compressor - cc.update_model( - model=rt["compressor_model"], - context_length=rt["compressor_context_length"], - base_url=rt["compressor_base_url"], - api_key=rt["compressor_api_key"], - provider=rt["compressor_provider"], - ) - - # ── Reset fallback chain for the new turn ── - self._fallback_activated = False - self._fallback_index = 0 - - logging.info( - "Primary runtime restored for new turn: %s (%s)", - self.model, self.provider, - ) - return True - except Exception as e: - logging.warning("Failed to restore primary runtime: %s", e) - return False - - # Which error types indicate a transient transport failure worth - # one more attempt with a rebuilt client / connection pool. 
- _TRANSIENT_TRANSPORT_ERRORS = frozenset({ - "ReadTimeout", "ConnectTimeout", "PoolTimeout", - "ConnectError", "RemoteProtocolError", - "APIConnectionError", "APITimeoutError", - }) + """Forwarder — see ``agent.agent_runtime_helpers.restore_primary_runtime``.""" + from agent.agent_runtime_helpers import restore_primary_runtime + return restore_primary_runtime(self) def _try_recover_primary_transport( self, api_error: Exception, *, retry_count: int, max_retries: int, ) -> bool: - """Attempt one extra primary-provider recovery cycle for transient transport failures. - - After ``max_retries`` exhaust, rebuild the primary client (clearing - stale connection pools) and give it one more attempt before falling - back. This is most useful for direct endpoints (custom, Z.AI, - Anthropic, OpenAI, local models) where a TCP-level hiccup does not - mean the provider is down. - - Skipped for proxy/aggregator providers (OpenRouter, Nous) which - already manage connection pools and retries server-side — if our - retries through them are exhausted, one more rebuilt client won't help. 
- """ - if self._fallback_activated: - return False - - # Only for transient transport errors - error_type = type(api_error).__name__ - if error_type not in self._TRANSIENT_TRANSPORT_ERRORS: - return False - - # Skip for aggregator providers — they manage their own retry infra - if self._is_openrouter_url(): - return False - provider_lower = (self.provider or "").strip().lower() - if provider_lower in {"nous", "nous-research"}: - return False - - try: - # Close existing client to release stale connections - if getattr(self, "client", None) is not None: - try: - self._close_openai_client( - self.client, reason="primary_recovery", shared=True, - ) - except Exception: - pass - - # Rebuild from primary snapshot - rt = self._primary_runtime - self._client_kwargs = dict(rt["client_kwargs"]) - self.model = rt["model"] - self.provider = rt["provider"] - self.base_url = rt["base_url"] - self.api_mode = rt["api_mode"] - if hasattr(self, "_transport_cache"): - self._transport_cache.clear() - self.api_key = rt["api_key"] - - if self.api_mode == "anthropic_messages": - from agent.anthropic_adapter import build_anthropic_client - self._anthropic_api_key = rt["anthropic_api_key"] - self._anthropic_base_url = rt["anthropic_base_url"] - self._anthropic_client = build_anthropic_client( - rt["anthropic_api_key"], rt["anthropic_base_url"], - timeout=get_provider_request_timeout(self.provider, self.model), - ) - self._is_anthropic_oauth = rt["is_anthropic_oauth"] - self.client = None - else: - self.client = self._create_openai_client( - dict(rt["client_kwargs"]), - reason="primary_recovery", - shared=True, - ) - - wait_time = min(3 + retry_count, 8) - self._vprint( - f"{self.log_prefix}🔁 Transient {error_type} on {self.provider} — " - f"rebuilt client, waiting {wait_time}s before one last primary attempt.", - force=True, - ) - time.sleep(wait_time) - return True - except Exception as e: - logging.warning("Primary transport recovery failed: %s", e) - return False - - # ── End provider 
fallback ────────────────────────────────────────────── + """Forwarder — see ``agent.agent_runtime_helpers.try_recover_primary_transport``.""" + from agent.agent_runtime_helpers import try_recover_primary_transport + return try_recover_primary_transport(self, api_error, retry_count=retry_count, max_retries=max_retries) @staticmethod def _content_has_image_parts(content: Any) -> bool: @@ -9676,116 +3345,9 @@ class AIAgent: return summary def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool: - """Re-encode all native image parts at a smaller size to recover from - image-too-large errors (Anthropic 5 MB, unknown other providers). - - Mutates ``api_messages`` in place. Returns True if any image part was - actually replaced, False if there were no image parts to shrink or - Pillow couldn't help (caller should surface the original error). - - Strategy: look for ``image_url`` / ``input_image`` parts carrying a - ``data:image/...;base64,...`` payload. For each one whose encoded - size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB - ceiling with header overhead), write the base64 to a tempfile, call - ``vision_tools._resize_image_for_vision`` to produce a smaller data - URL, and substitute it in place. - - Non-data-URL images (http/https URLs) are not touched — the provider - fetches those itself and the size limit is different. - """ - if not api_messages: - return False - - try: - from tools.vision_tools import _resize_image_for_vision - except Exception as exc: - logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc) - return False - - # 4 MB target leaves comfortable headroom under Anthropic's 5 MB. - # Non-Anthropic providers we haven't observed rejecting are fine with - # much larger; shrinking to 4 MB here loses quality but only fires - # after a confirmed provider rejection, so the alternative is failure. 
- target_bytes = 4 * 1024 * 1024 - changed_count = 0 - - def _shrink_data_url(url: str) -> Optional[str]: - """Return a smaller data URL, or None if shrink can't help.""" - if not isinstance(url, str) or not url.startswith("data:"): - return None - if len(url) <= target_bytes: - # This specific image wasn't the oversized one. - return None - try: - header, _, data = url.partition(",") - mime = "image/jpeg" - if header.startswith("data:"): - mime_part = header[len("data:"):].split(";", 1)[0].strip() - if mime_part.startswith("image/"): - mime = mime_part - import base64 as _b64 - raw = _b64.b64decode(data) - suffix = { - "image/png": ".png", "image/gif": ".gif", "image/webp": ".webp", - "image/jpeg": ".jpg", "image/jpg": ".jpg", "image/bmp": ".bmp", - }.get(mime, ".jpg") - tmp = tempfile.NamedTemporaryFile( - prefix="hermes_shrink_", suffix=suffix, delete=False, - ) - try: - tmp.write(raw) - tmp.close() - resized = _resize_image_for_vision( - Path(tmp.name), - mime_type=mime, - max_base64_bytes=target_bytes, - ) - finally: - try: - Path(tmp.name).unlink(missing_ok=True) - except Exception: - pass - if not resized or len(resized) >= len(url): - # Shrink didn't help (or made it bigger — corrupt input?). 
- return None - return resized - except Exception as exc: - logger.warning("image-shrink recovery: re-encode failed — %s", exc) - return None - - for msg in api_messages: - if not isinstance(msg, dict): - continue - content = msg.get("content") - if not isinstance(content, list): - continue - for part in content: - if not isinstance(part, dict): - continue - ptype = part.get("type") - if ptype not in {"image_url", "input_image"}: - continue - image_value = part.get("image_url") - # OpenAI chat.completions: {"image_url": {"url": "data:..."}} - # OpenAI Responses: {"image_url": "data:..."} - if isinstance(image_value, dict): - url = image_value.get("url", "") - resized = _shrink_data_url(url) - if resized: - image_value["url"] = resized - changed_count += 1 - elif isinstance(image_value, str): - resized = _shrink_data_url(image_value) - if resized: - part["image_url"] = resized - changed_count += 1 - - if changed_count: - logger.info( - "image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB", - changed_count, target_bytes / (1024 * 1024), - ) - return changed_count > 0 + """Forwarder — see ``agent.conversation_compression.try_shrink_image_parts_in_messages``.""" + from agent.conversation_compression import try_shrink_image_parts_in_messages + return try_shrink_image_parts_in_messages(api_messages) def _anthropic_preserve_dots(self) -> bool: """True when using an anthropic-compatible endpoint that preserves dots in model names. 
@@ -9887,225 +3449,9 @@ class AIAgent: break def _build_api_kwargs(self, api_messages: list) -> dict: - """Build the keyword arguments dict for the active API mode.""" - tools_for_api = self.tools - - if self.api_mode == "anthropic_messages": - _transport = self._get_transport() - anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages) - ctx_len = getattr(self, "context_compressor", None) - ctx_len = ctx_len.context_length if ctx_len else None - ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) - if ephemeral_out is not None: - self._ephemeral_max_output_tokens = None # consume immediately - return _transport.build_kwargs( - model=self.model, - messages=anthropic_messages, - tools=tools_for_api, - max_tokens=ephemeral_out if ephemeral_out is not None else self.max_tokens, - reasoning_config=self.reasoning_config, - is_oauth=self._is_anthropic_oauth, - preserve_dots=self._anthropic_preserve_dots(), - context_length=ctx_len, - base_url=getattr(self, "_anthropic_base_url", None), - fast_mode=(self.request_overrides or {}).get("speed") == "fast", - drop_context_1m_beta=bool(getattr(self, "_oauth_1m_beta_disabled", False)), - ) - - # AWS Bedrock native Converse API — bypasses the OpenAI client entirely. - # The adapter handles message/tool conversion and boto3 calls directly. 
- if self.api_mode == "bedrock_converse": - _bt = self._get_transport() - region = getattr(self, "_bedrock_region", None) or "us-east-1" - guardrail = getattr(self, "_bedrock_guardrail_config", None) - return _bt.build_kwargs( - model=self.model, - messages=api_messages, - tools=tools_for_api, - max_tokens=self.max_tokens or 4096, - region=region, - guardrail_config=guardrail, - ) - - if self.api_mode == "codex_responses": - _ct = self._get_transport() - is_github_responses = ( - base_url_host_matches(self.base_url, "models.github.ai") - or base_url_host_matches(self.base_url, "api.githubcopilot.com") - ) - is_codex_backend = ( - self.provider == "openai-codex" - or ( - self._base_url_hostname == "chatgpt.com" - and "/backend-api/codex" in self._base_url_lower - ) - ) - is_xai_responses = self.provider in {"xai", "xai-oauth"} or self._base_url_hostname == "api.x.ai" - _msgs_for_codex = self._prepare_messages_for_non_vision_model(api_messages) - return _ct.build_kwargs( - model=self.model, - messages=_msgs_for_codex, - tools=tools_for_api, - reasoning_config=self.reasoning_config, - session_id=getattr(self, "session_id", None), - max_tokens=self.max_tokens, - request_overrides=self.request_overrides, - is_github_responses=is_github_responses, - is_codex_backend=is_codex_backend, - is_xai_responses=is_xai_responses, - github_reasoning_extra=self._github_models_reasoning_extra_body() if is_github_responses else None, - ) - - # ── chat_completions (default) ───────────────────────────────────── - _ct = self._get_transport() - - # Provider detection flags - _is_qwen = self._is_qwen_portal() - _is_or = self._is_openrouter_url() - _is_gh = ( - base_url_host_matches(self._base_url_lower, "models.github.ai") - or base_url_host_matches(self._base_url_lower, "api.githubcopilot.com") - ) - _is_nous = "nousresearch" in self._base_url_lower - _is_nvidia = "integrate.api.nvidia.com" in self._base_url_lower - _is_kimi = ( - base_url_host_matches(self.base_url, "api.kimi.com") - or 
base_url_host_matches(self.base_url, "moonshot.ai") - or base_url_host_matches(self.base_url, "moonshot.cn") - ) - _is_tokenhub = base_url_host_matches(self._base_url_lower, "tokenhub.tencentmaas.com") - _is_lmstudio = (self.provider or "").strip().lower() == "lmstudio" - - # Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE - # sentinel (temperature omitted entirely), a numeric override, or None. - try: - from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE - _ft = _fixed_temperature_for_model(self.model, self.base_url) - _omit_temp = _ft is OMIT_TEMPERATURE - _fixed_temp = _ft if not _omit_temp else None - except Exception: - _omit_temp = False - _fixed_temp = None - - # Provider preferences (OpenRouter-style) - _prefs: Dict[str, Any] = {} - if self.providers_allowed: - _prefs["only"] = self.providers_allowed - if self.providers_ignored: - _prefs["ignore"] = self.providers_ignored - if self.providers_order: - _prefs["order"] = self.providers_order - if self.provider_sort: - _prefs["sort"] = self.provider_sort - if self.provider_require_parameters: - _prefs["require_parameters"] = True - if self.provider_data_collection: - _prefs["data_collection"] = self.provider_data_collection - - # Claude max-output override on aggregators - _ant_max = None - if (_is_or or _is_nous) and "claude" in (self.model or "").lower(): - try: - from agent.anthropic_adapter import _get_anthropic_max_output - _ant_max = _get_anthropic_max_output(self.model) - except Exception: - pass - - # Qwen session metadata - _qwen_meta = None - if _is_qwen: - _qwen_meta = { - "sessionId": self.session_id or "hermes", - "promptId": str(uuid.uuid4()), - } - - # ── Provider profile path (registered providers) ─────────────────── - # Profiles handle per-provider quirks via hooks. When a profile is - # found, delegate fully; otherwise fall through to the legacy flag path. 
- try: - from providers import get_provider_profile - _profile = get_provider_profile(self.provider) - except Exception: - _profile = None - - if _profile: - _ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) - if _ephemeral_out is not None: - self._ephemeral_max_output_tokens = None - - # Strip image parts for non-vision models that have provider profiles - # (e.g. DeepSeek, Kimi). The legacy path below already does this, but - # registered providers with profiles were bypassing the strip. - api_messages = self._prepare_messages_for_non_vision_model(api_messages) - - return _ct.build_kwargs( - model=self.model, - messages=api_messages, - tools=tools_for_api, - base_url=self.base_url, - timeout=self._resolved_api_call_timeout(), - max_tokens=self.max_tokens, - ephemeral_max_output_tokens=_ephemeral_out, - max_tokens_param_fn=self._max_tokens_param, - reasoning_config=self.reasoning_config, - request_overrides=self.request_overrides, - session_id=getattr(self, "session_id", None), - provider_profile=_profile, - ollama_num_ctx=self._ollama_num_ctx, - # Context forwarded to profile hooks: - provider_preferences=_prefs or None, - openrouter_min_coding_score=self.openrouter_min_coding_score, - anthropic_max_output=_ant_max, - supports_reasoning=self._supports_reasoning_extra_body(), - qwen_session_metadata=_qwen_meta, - ) - - # ── Legacy flag path ──────────────────────────────────────────── - # Reached only when get_provider_profile() returns None — i.e. a - # completely unknown provider not in providers/ registry. - _ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) - if _ephemeral_out is not None: - self._ephemeral_max_output_tokens = None - - # Strip image parts for non-vision models (no-op when vision-capable). 
- _msgs_for_chat = self._prepare_messages_for_non_vision_model(api_messages) - - return _ct.build_kwargs( - model=self.model, - messages=_msgs_for_chat, - tools=tools_for_api, - base_url=self.base_url, - timeout=self._resolved_api_call_timeout(), - max_tokens=self.max_tokens, - ephemeral_max_output_tokens=_ephemeral_out, - max_tokens_param_fn=self._max_tokens_param, - reasoning_config=self.reasoning_config, - request_overrides=self.request_overrides, - session_id=getattr(self, "session_id", None), - model_lower=(self.model or "").lower(), - is_openrouter=_is_or, - is_nous=_is_nous, - is_qwen_portal=_is_qwen, - is_github_models=_is_gh, - is_nvidia_nim=_is_nvidia, - is_kimi=_is_kimi, - is_tokenhub=_is_tokenhub, - is_lmstudio=_is_lmstudio, - is_custom_provider=self.provider == "custom", - ollama_num_ctx=self._ollama_num_ctx, - provider_preferences=_prefs or None, - openrouter_min_coding_score=self.openrouter_min_coding_score, - qwen_prepare_fn=self._qwen_prepare_chat_messages if _is_qwen else None, - qwen_prepare_inplace_fn=self._qwen_prepare_chat_messages_inplace if _is_qwen else None, - qwen_session_metadata=_qwen_meta, - fixed_temperature=_fixed_temp, - omit_temperature=_omit_temp, - supports_reasoning=self._supports_reasoning_extra_body(), - github_reasoning_extra=self._github_models_reasoning_extra_body() if _is_gh else None, - lmstudio_reasoning_options=self._lmstudio_reasoning_options_cached() if _is_lmstudio else None, - anthropic_max_output=_ant_max, - provider_name=self.provider, - ) + """Forwarder — see ``agent.chat_completion_helpers.build_api_kwargs``.""" + from agent.chat_completion_helpers import build_api_kwargs + return build_api_kwargs(self, api_messages) def _supports_reasoning_extra_body(self) -> bool: """Return True when reasoning extra_body is safe to send for this route/model. 
@@ -10231,197 +3577,9 @@ class AIAgent: return {"effort": requested_effort} def _build_assistant_message(self, assistant_message, finish_reason: str) -> dict: - """Build a normalized assistant message dict from an API response message. - - Handles reasoning extraction, reasoning_details, and optional tool_calls - so both the tool-call path and the final-response path share one builder. - """ - assistant_tool_calls = getattr(assistant_message, "tool_calls", None) - reasoning_text = self._extract_reasoning(assistant_message) - _from_structured = bool(reasoning_text) - - # Fallback: extract inline blocks from content when no structured - # reasoning fields are present (some models/providers embed thinking - # directly in the content rather than returning separate API fields). - if not reasoning_text: - content = assistant_message.content or "" - think_blocks = re.findall(r'(.*?)', content, flags=re.DOTALL) - if think_blocks: - combined = "\n\n".join(b.strip() for b in think_blocks if b.strip()) - reasoning_text = combined or None - - if reasoning_text and self.verbose_logging: - logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}") - - if reasoning_text and self.reasoning_callback: - # Skip callback when streaming is active — reasoning was already - # displayed during the stream via one of two paths: - # (a) _fire_reasoning_delta (structured reasoning_content deltas) - # (b) _stream_delta tag extraction (/) - # When streaming is NOT active, always fire so non-streaming modes - # (gateway, batch, quiet) still get reasoning. - # Any reasoning that wasn't shown during streaming is caught by the - # CLI post-response display fallback (cli.py _reasoning_shown_this_turn). - if not self.stream_delta_callback and not self._stream_callback: - try: - self.reasoning_callback(reasoning_text) - except Exception: - pass - - # Sanitize surrogates from API response — some models (e.g. 
Kimi/GLM via Ollama) - # can return invalid surrogate code points that crash json.dumps() on persist. - _raw_content = assistant_message.content or "" - _san_content = _sanitize_surrogates(_raw_content) - if reasoning_text: - reasoning_text = _sanitize_surrogates(reasoning_text) - - # Strip inline reasoning tags ( etc.) from the stored - # assistant content. Reasoning was already captured into - # ``reasoning_text`` above (either from structured fields or the - # inline-block fallback), so the raw tags in content are redundant. - # Leaving them in place caused reasoning to leak to messaging - # platforms (#8878, #9568), inflate context on subsequent turns - # (#9306 observed 16% content-size reduction on a real MiniMax - # session), and pollute generated session titles. One strip at the - # storage boundary cleans content for every downstream consumer: - # API replay, session transcript, gateway delivery, CLI display, - # compression, title generation. - if isinstance(_san_content, str) and _san_content: - _san_content = self._strip_think_blocks(_san_content).strip() - - msg = { - "role": "assistant", - "content": _san_content, - "reasoning": reasoning_text, - "finish_reason": finish_reason, - } - - raw_reasoning_content = getattr(assistant_message, "reasoning_content", None) - if raw_reasoning_content is None and hasattr(assistant_message, "model_extra"): - model_extra = getattr(assistant_message, "model_extra", None) or {} - if isinstance(model_extra, dict) and "reasoning_content" in model_extra: - raw_reasoning_content = model_extra["reasoning_content"] - if raw_reasoning_content is not None: - msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content) - elif assistant_tool_calls and self._needs_thinking_reasoning_pad(): - # DeepSeek v4 thinking mode and Kimi / Moonshot thinking mode - # both require reasoning_content on every assistant tool-call - # message. 
Without it, replaying the persisted message causes - # HTTP 400 ("The reasoning_content in the thinking mode must - # be passed back to the API"). Include streamed reasoning - # text when captured; otherwise pad with a single space — - # DeepSeek V4 Pro tightened validation and rejects empty - # string ("The reasoning content in the thinking mode must - # be passed back to the API"). A space satisfies non-empty - # checks everywhere without leaking fabricated reasoning. - # Refs #15250, #17400, #17341. - msg["reasoning_content"] = reasoning_text or " " - - # Additive fallback (refs #16844, #16884). Streaming-only providers - # (glm, MiniMax, gpt-5.x via aigw, Anthropic via openai-compat shims) - # accumulate reasoning through ``delta.reasoning_content`` chunks - # but never land it on the message object as a top-level attribute, - # so neither branch above fires and the chain-of-thought is stored - # only under the internal ``reasoning`` key. When the user later - # replays that history through a DeepSeek-v4 / Kimi thinking model, - # the missing ``reasoning_content`` causes HTTP 400 ("The - # reasoning_content in the thinking mode must be passed back to the - # API."). - # - # Promote the already-sanitized streamed ``reasoning_text`` to - # ``reasoning_content`` at write time, but ONLY when no prior branch - # already set it AND we actually captured reasoning text. This - # preserves every existing behavior: - # - SDK-exposed ``reasoning_content`` (OpenAI/Moonshot/DeepSeek SDK) - # still wins. - # - DeepSeek tool-call ""-pad (#15250) still fires. - # - Non-thinking turns with no reasoning leave the field absent, - # so ``_copy_reasoning_content_for_api``'s cross-provider leak - # guard (#15748) and ``reasoning``→``reasoning_content`` - # promotion tiers still apply at replay time. 
- if "reasoning_content" not in msg and reasoning_text: - msg["reasoning_content"] = reasoning_text - - if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details: - # Pass reasoning_details back unmodified so providers (OpenRouter, - # Anthropic, OpenAI) can maintain reasoning continuity across turns. - # Each provider may include opaque fields (signature, encrypted_content) - # that must be preserved exactly. - raw_details = assistant_message.reasoning_details - preserved = [] - for d in raw_details: - if isinstance(d, dict): - preserved.append(d) - elif hasattr(d, "__dict__"): - preserved.append(d.__dict__) - elif hasattr(d, "model_dump"): - preserved.append(d.model_dump()) - if preserved: - msg["reasoning_details"] = preserved - - # Codex Responses API: preserve encrypted reasoning items for - # multi-turn continuity. These get replayed as input on the next turn. - codex_items = getattr(assistant_message, "codex_reasoning_items", None) - if codex_items: - msg["codex_reasoning_items"] = codex_items - - # Codex Responses API: preserve exact assistant message items (with - # id/phase) so follow-up turns can replay structured items instead of - # flattening to plain text. This is required for prefix cache hits. 
- codex_message_items = getattr(assistant_message, "codex_message_items", None) - if codex_message_items: - msg["codex_message_items"] = codex_message_items - - if assistant_tool_calls: - tool_calls = [] - for tool_call in assistant_tool_calls: - raw_id = getattr(tool_call, "id", None) - call_id = getattr(tool_call, "call_id", None) - if not isinstance(call_id, str) or not call_id.strip(): - embedded_call_id, _ = self._split_responses_tool_id(raw_id) - call_id = embedded_call_id - if not isinstance(call_id, str) or not call_id.strip(): - if isinstance(raw_id, str) and raw_id.strip(): - call_id = raw_id.strip() - else: - _fn = getattr(tool_call, "function", None) - _fn_name = getattr(_fn, "name", "") if _fn else "" - _fn_args = getattr(_fn, "arguments", "{}") if _fn else "{}" - call_id = self._deterministic_call_id(_fn_name, _fn_args, len(tool_calls)) - call_id = call_id.strip() - - response_item_id = getattr(tool_call, "response_item_id", None) - if not isinstance(response_item_id, str) or not response_item_id.strip(): - _, embedded_response_item_id = self._split_responses_tool_id(raw_id) - response_item_id = embedded_response_item_id - - response_item_id = self._derive_responses_function_call_id( - call_id, - response_item_id if isinstance(response_item_id, str) else None, - ) - - tc_dict = { - "id": call_id, - "call_id": call_id, - "response_item_id": response_item_id, - "type": tool_call.type, - "function": { - "name": tool_call.function.name, - "arguments": tool_call.function.arguments - }, - } - # Preserve extra_content (e.g. Gemini thought_signature) so it - # is sent back on subsequent API calls. Without this, Gemini 3 - # thinking models reject the request with a 400 error. 
- extra = getattr(tool_call, "extra_content", None) - if extra is not None: - if hasattr(extra, "model_dump"): - extra = extra.model_dump() - tc_dict["extra_content"] = extra - tool_calls.append(tc_dict) - msg["tool_calls"] = tool_calls - - return msg + """Forwarder — see ``agent.chat_completion_helpers.build_assistant_message``.""" + from agent.chat_completion_helpers import build_assistant_message + return build_assistant_message(self, assistant_message, finish_reason) def _needs_thinking_reasoning_pad(self) -> bool: """Return True when the active provider enforces reasoning_content echo-back. @@ -10486,74 +3644,9 @@ class AIAgent: ) def _copy_reasoning_content_for_api(self, source_msg: dict, api_msg: dict) -> None: - """Copy provider-facing reasoning fields onto an API replay message.""" - if source_msg.get("role") != "assistant": - return - - # 1. Explicit reasoning_content already set — preserve it verbatim - # (includes DeepSeek/Kimi's own space-placeholder written at creation - # time, and any valid reasoning content from the same provider). - # - # Exception: sessions persisted BEFORE #17341 have empty-string - # placeholders pinned at creation time. DeepSeek V4 Pro rejects - # those with HTTP 400. When the active provider enforces the - # thinking-mode echo, upgrade "" → " " on replay so stale history - # doesn't 400 the user on the next turn. - existing = source_msg.get("reasoning_content") - if isinstance(existing, str): - if existing == "" and self._needs_thinking_reasoning_pad(): - api_msg["reasoning_content"] = " " - else: - api_msg["reasoning_content"] = existing - return - - needs_thinking_pad = self._needs_thinking_reasoning_pad() - - # 2. Cross-provider poisoned history (#15748): on DeepSeek/Kimi, - # if the source turn has tool_calls AND a 'reasoning' field but no - # 'reasoning_content' key, the 'reasoning' text was written by a - # prior provider (e.g. 
MiniMax) — DeepSeek's own _build_assistant_message - # pins reasoning_content at creation time for tool-call turns, so the - # shape (reasoning set, reasoning_content absent, tool_calls present) - # is unreachable from same-provider DeepSeek history after this fix. - # Inject a single space to satisfy the API without leaking another - # provider's chain of thought to DeepSeek/Kimi. Space (not "") - # because DeepSeek V4 Pro rejects empty-string reasoning_content - # in thinking mode (refs #17341). - normalized_reasoning = source_msg.get("reasoning") - if ( - needs_thinking_pad - and source_msg.get("tool_calls") - and isinstance(normalized_reasoning, str) - and normalized_reasoning - ): - api_msg["reasoning_content"] = " " - return - - # 3. Healthy session: promote 'reasoning' field to 'reasoning_content' - # for providers that use the internal 'reasoning' key. - # This must happen before the unconditional empty-string fallback so - # genuine reasoning content is not overwritten (#15812 regression in - # PR #15478). - if isinstance(normalized_reasoning, str) and normalized_reasoning: - api_msg["reasoning_content"] = normalized_reasoning - return - - # 4. DeepSeek / Kimi thinking mode: all assistant messages need - # reasoning_content. Inject a single space to satisfy the provider's - # requirement when no explicit reasoning content is present. Covers - # both tool-call turns (already-poisoned history with no reasoning - # at all) and plain text turns. Space (not "") because DeepSeek V4 - # Pro tightened validation and rejects empty string with HTTP 400 - # ("The reasoning content in the thinking mode must be passed back - # to the API"). Refs #17341. - if needs_thinking_pad: - api_msg["reasoning_content"] = " " - return - - # 5. reasoning_content was present but not a string (e.g. None after - # context compaction). Don't pass null to the API. 
- api_msg.pop("reasoning_content", None) + """Forwarder — see ``agent.agent_runtime_helpers.copy_reasoning_content_for_api``.""" + from agent.agent_runtime_helpers import copy_reasoning_content_for_api + return copy_reasoning_content_for_api(self, source_msg, api_msg) @staticmethod def _sanitize_tool_calls_for_strict_api(api_msg: dict) -> dict: @@ -10590,108 +3683,9 @@ class AIAgent: logger=None, session_id: str = None, ) -> int: - """Repair corrupted assistant tool-call argument JSON in-place.""" - log = logger or logging.getLogger(__name__) - if not isinstance(messages, list): - return 0 - - repaired = 0 - marker = AIAgent._TOOL_CALL_ARGUMENTS_CORRUPTION_MARKER - - def _prepend_marker(tool_msg: dict) -> None: - existing = tool_msg.get("content") - if isinstance(existing, str): - if not existing: - tool_msg["content"] = marker - elif not existing.startswith(marker): - tool_msg["content"] = f"{marker}\n{existing}" - return - if existing is None: - tool_msg["content"] = marker - return - try: - existing_text = json.dumps(existing) - except TypeError: - existing_text = str(existing) - tool_msg["content"] = f"{marker}\n{existing_text}" - - message_index = 0 - while message_index < len(messages): - msg = messages[message_index] - if not isinstance(msg, dict) or msg.get("role") != "assistant": - message_index += 1 - continue - - tool_calls = msg.get("tool_calls") - if not isinstance(tool_calls, list) or not tool_calls: - message_index += 1 - continue - - insert_at = message_index + 1 - for tool_call in tool_calls: - if not isinstance(tool_call, dict): - continue - function = tool_call.get("function") - if not isinstance(function, dict): - continue - - arguments = function.get("arguments") - if arguments is None or arguments == "": - function["arguments"] = "{}" - continue - if isinstance(arguments, str) and not arguments.strip(): - function["arguments"] = "{}" - continue - if not isinstance(arguments, str): - continue - - try: - json.loads(arguments) - except 
json.JSONDecodeError: - tool_call_id = tool_call.get("id") - function_name = function.get("name", "?") - preview = arguments[:80] - log.warning( - "Corrupted tool_call arguments repaired before request " - "(session=%s, message_index=%s, tool_call_id=%s, function=%s, preview=%r)", - session_id or "-", - message_index, - tool_call_id or "-", - function_name, - preview, - ) - function["arguments"] = "{}" - - existing_tool_msg = None - scan_index = message_index + 1 - while scan_index < len(messages): - candidate = messages[scan_index] - if not isinstance(candidate, dict) or candidate.get("role") != "tool": - break - if candidate.get("tool_call_id") == tool_call_id: - existing_tool_msg = candidate - break - scan_index += 1 - - if existing_tool_msg is None: - messages.insert( - insert_at, - { - "role": "tool", - "name": function_name if function_name != "?" else "", - "tool_call_id": tool_call_id, - "content": marker, - }, - ) - insert_at += 1 - else: - _prepend_marker(existing_tool_msg) - - repaired += 1 - - message_index += 1 - - return repaired + """Forwarder — see ``agent.agent_runtime_helpers.sanitize_tool_call_arguments``.""" + from agent.agent_runtime_helpers import sanitize_tool_call_arguments + return sanitize_tool_call_arguments(messages, logger=logger, session_id=session_id) def _should_sanitize_tool_calls(self) -> bool: """Determine if tool_calls need sanitization for strict APIs. @@ -10707,185 +3701,12 @@ class AIAgent: return self.api_mode != "codex_responses" def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default", focus_topic: str = None) -> tuple: - """Compress conversation context and split the session in SQLite. - - Args: - focus_topic: Optional focus string for guided compression — the - summariser will prioritise preserving information related to - this topic. Inspired by Claude Code's ``/compact ``. 
- - Returns: - (compressed_messages, new_system_prompt) tuple - """ - _pre_msg_count = len(messages) - logger.info( - "context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r", - self.session_id or "none", _pre_msg_count, - f"{approx_tokens:,}" if approx_tokens else "unknown", self.model, - focus_topic, + """Forwarder — see ``agent.conversation_compression.compress_context``.""" + from agent.conversation_compression import compress_context + return compress_context( + self, messages, system_message, + approx_tokens=approx_tokens, task_id=task_id, focus_topic=focus_topic, ) - self._emit_status( - "🗜️ Compacting context — summarizing earlier conversation so I can continue..." - ) - - # Notify external memory provider before compression discards context - if self._memory_manager: - try: - self._memory_manager.on_pre_compress(messages) - except Exception: - pass - - try: - compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic) - except TypeError: - # Plugin context engine with strict signature that doesn't accept - # focus_topic — fall back to calling without it. - compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens) - - summary_error = getattr(self.context_compressor, "_last_summary_error", None) - if summary_error: - if getattr(self, "_last_compression_summary_warning", None) != summary_error: - self._last_compression_summary_warning = summary_error - self._emit_warning( - f"⚠ Compression summary failed: {summary_error}. " - "Inserted a fallback context marker." - ) - else: - # No hard failure — but did the configured aux model error out - # and get recovered by retrying on main? Surface that so users - # know their auxiliary.compression.model setting is broken even - # though compression succeeded. 
- _aux_fail_model = getattr(self.context_compressor, "_last_aux_model_failure_model", None) - _aux_fail_err = getattr(self.context_compressor, "_last_aux_model_failure_error", None) - if _aux_fail_model: - # Dedup on (model, error) so we don't spam on every compaction - _aux_key = (_aux_fail_model, _aux_fail_err) - if getattr(self, "_last_aux_fallback_warning_key", None) != _aux_key: - self._last_aux_fallback_warning_key = _aux_key - self._emit_warning( - f"ℹ Configured compression model '{_aux_fail_model}' failed " - f"({_aux_fail_err or 'unknown error'}). Recovered using main model — " - "check auxiliary.compression.model in config.yaml." - ) - - todo_snapshot = self._todo_store.format_for_injection() - if todo_snapshot: - compressed.append({"role": "user", "content": todo_snapshot}) - - self._invalidate_system_prompt() - new_system_prompt = self._build_system_prompt(system_message) - self._cached_system_prompt = new_system_prompt - - if self._session_db: - try: - # Propagate title to the new session with auto-numbering - old_title = self._session_db.get_session_title(self.session_id) - # Trigger memory extraction on the old session before it rotates. 
- self.commit_memory_session(messages) - self._session_db.end_session(self.session_id, "compression") - old_session_id = self.session_id - self.session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}" - os.environ["HERMES_SESSION_ID"] = self.session_id - try: - from gateway.session_context import _SESSION_ID - _SESSION_ID.set(self.session_id) - except Exception: - pass - # Update session_log_file to point to the new session's JSON file - self.session_log_file = self.logs_dir / f"session_{self.session_id}.json" - self._session_db_created = False - self._session_db.create_session( - session_id=self.session_id, - source=self.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"), - model=self.model, - model_config=self._session_init_model_config, - parent_session_id=old_session_id, - ) - self._session_db_created = True - # Auto-number the title for the continuation session - if old_title: - try: - new_title = self._session_db.get_next_title_in_lineage(old_title) - self._session_db.set_session_title(self.session_id, new_title) - except (ValueError, Exception) as e: - logger.debug("Could not propagate title on compression: %s", e) - self._session_db.update_system_prompt(self.session_id, new_system_prompt) - # Reset flush cursor — new session starts with no messages written - self._last_flushed_db_idx = 0 - except Exception as e: - logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e) - - # Notify the context engine that the session_id rotated because of - # compression (not a fresh /new). Plugin engines (e.g. hermes-lcm) use - # boundary_reason="compression" to preserve DAG lineage across the - # rollover instead of re-initializing fresh per-session state. - # See hermes-lcm#68. Built-in ContextCompressor ignores kwargs. 
- try: - _old_sid = locals().get("old_session_id") - if _old_sid and hasattr(self.context_compressor, "on_session_start"): - self.context_compressor.on_session_start( - self.session_id or "", - boundary_reason="compression", - old_session_id=_old_sid, - ) - except Exception as _ce_err: - logger.debug("context engine on_session_start (compression): %s", _ce_err) - - # Notify memory providers of the compression-driven session_id rotation - # so provider-cached per-session state (Hindsight's _document_id, - # accumulated turn buffers, counters) refreshes. reset=False because - # the logical conversation continues; only the id and DB row rolled - # over. See #6672. - try: - _old_sid = locals().get("old_session_id") - if _old_sid and self._memory_manager: - self._memory_manager.on_session_switch( - self.session_id or "", - parent_session_id=_old_sid, - reset=False, - reason="compression", - ) - except Exception as _me_err: - logger.debug("memory manager on_session_switch (compression): %s", _me_err) - - # Warn on repeated compressions (quality degrades with each pass) - _cc = self.context_compressor.compression_count - if _cc >= 2: - self._vprint( - f"{self.log_prefix}⚠️ Session compressed {_cc} times — " - f"accuracy may degrade. Consider /new to start fresh.", - force=True, - ) - - # Update token estimate after compaction so pressure calculations - # use the post-compression count, not the stale pre-compression one. - # Use estimate_request_tokens_rough() so tool schemas are included — - # with 50+ tools enabled, schemas alone can add 20-30K tokens, and - # omitting them delays the next compression cycle far past the - # configured threshold (issue #14695). - _compressed_est = estimate_request_tokens_rough( - compressed, - system_prompt=new_system_prompt or "", - tools=self.tools or None, - ) - self.context_compressor.last_prompt_tokens = _compressed_est - self.context_compressor.last_completion_tokens = 0 - - # Clear the file-read dedup cache. 
After compression the original - # read content is summarised away — if the model re-reads the same - # file it needs the full content, not a "file unchanged" stub. - try: - from tools.file_tools import reset_file_dedup - reset_file_dedup(task_id) - except Exception: - pass - - logger.info( - "context compression done: session=%s messages=%d->%d tokens=~%s", - self.session_id or "none", _pre_msg_count, len(compressed), - f"{_compressed_est:,}", - ) - return compressed, new_system_prompt def _set_tool_guardrail_halt(self, decision: ToolGuardrailDecision) -> None: """Record the first guardrail decision that should stop this turn.""" @@ -10970,89 +3791,9 @@ class AIAgent: def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str, tool_call_id: Optional[str] = None, messages: list = None, pre_tool_block_checked: bool = False) -> str: - """Invoke a single tool and return the result string. No display logic. - - Handles both agent-level tools (todo, memory, etc.) and registry-dispatched - tools. Used by the concurrent execution path; the sequential path retains - its own inline invocation for backward-compatible display handling. - """ - # Check plugin hooks for a block directive before executing anything. 
- block_message: Optional[str] = None - if not pre_tool_block_checked: - try: - from hermes_cli.plugins import get_pre_tool_call_block_message - block_message = get_pre_tool_call_block_message( - function_name, function_args, task_id=effective_task_id or "", - ) - except Exception: - pass - if block_message is not None: - return json.dumps({"error": block_message}, ensure_ascii=False) - - if function_name == "todo": - from tools.todo_tool import todo_tool as _todo_tool - return _todo_tool( - todos=function_args.get("todos"), - merge=function_args.get("merge", False), - store=self._todo_store, - ) - elif function_name == "session_search": - session_db = self._get_session_db_for_recall() - if not session_db: - from hermes_state import format_session_db_unavailable - return json.dumps({"success": False, "error": format_session_db_unavailable()}) - from tools.session_search_tool import session_search as _session_search - return _session_search( - query=function_args.get("query", ""), - role_filter=function_args.get("role_filter"), - limit=function_args.get("limit", 3), - db=session_db, - current_session_id=self.session_id, - ) - elif function_name == "memory": - target = function_args.get("target", "memory") - from tools.memory_tool import memory_tool as _memory_tool - result = _memory_tool( - action=function_args.get("action"), - target=target, - content=function_args.get("content"), - old_text=function_args.get("old_text"), - store=self._memory_store, - ) - # Bridge: notify external memory provider of built-in memory writes - if self._memory_manager and function_args.get("action") in {"add", "replace"}: - try: - self._memory_manager.on_memory_write( - function_args.get("action", ""), - target, - function_args.get("content", ""), - metadata=self._build_memory_write_metadata( - task_id=effective_task_id, - tool_call_id=tool_call_id, - ), - ) - except Exception: - pass - return result - elif self._memory_manager and self._memory_manager.has_tool(function_name): - return 
self._memory_manager.handle_tool_call(function_name, function_args) - elif function_name == "clarify": - from tools.clarify_tool import clarify_tool as _clarify_tool - return _clarify_tool( - question=function_args.get("question", ""), - choices=function_args.get("choices"), - callback=self.clarify_callback, - ) - elif function_name == "delegate_task": - return self._dispatch_delegate_task(function_args) - else: - return handle_function_call( - function_name, function_args, effective_task_id, - tool_call_id=tool_call_id, - session_id=self.session_id or "", - enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None, - skip_pre_tool_call_hook=True, - ) + """Forwarder — see ``agent.agent_runtime_helpers.invoke_tool``.""" + from agent.agent_runtime_helpers import invoke_tool + return invoke_tool(self, function_name, function_args, effective_task_id, tool_call_id, messages, pre_tool_block_checked) @staticmethod def _wrap_verbose(label: str, text: str, indent: str = " ") -> str: @@ -11080,1069 +3821,19 @@ class AIAgent: return f"{indent}{label}{body}" def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None: - """Execute multiple tool calls concurrently using a thread pool. - - Results are collected in the original tool-call order and appended to - messages so the API sees them in the expected sequence. 
- """ - tool_calls = assistant_message.tool_calls - num_tools = len(tool_calls) - - # ── Pre-flight: interrupt check ────────────────────────────────── - if self._interrupt_requested: - print(f"{self.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)") - for tc in tool_calls: - messages.append({ - "role": "tool", - "name": tc.function.name, - "content": f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]", - "tool_call_id": tc.id, - }) - return - - # ── Parse args + pre-execution bookkeeping ─────────────────────── - parsed_calls = [] # list of (tool_call, function_name, function_args) - for tool_call in tool_calls: - function_name = tool_call.function.name - - # Reset nudge counters - if function_name == "memory": - self._turns_since_memory = 0 - elif function_name == "skill_manage": - self._iters_since_skill = 0 - - try: - function_args = json.loads(tool_call.function.arguments) - except json.JSONDecodeError: - function_args = {} - if not isinstance(function_args, dict): - function_args = {} - - # Checkpoint for file-mutating tools - if function_name in {"write_file", "patch"} and self._checkpoint_mgr.enabled: - try: - file_path = function_args.get("path", "") - if file_path: - work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path) - self._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}") - except Exception: - pass - - # Checkpoint before destructive terminal commands - if function_name == "terminal" and self._checkpoint_mgr.enabled: - try: - cmd = function_args.get("command", "") - if _is_destructive_command(cmd): - cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd()) - self._checkpoint_mgr.ensure_checkpoint( - cwd, f"before terminal: {cmd[:60]}" - ) - except Exception: - pass - - block_result = None - blocked_by_guardrail = False - try: - from hermes_cli.plugins import get_pre_tool_call_block_message - block_message = get_pre_tool_call_block_message( - function_name, 
function_args, task_id=effective_task_id or "", - ) - except Exception: - block_message = None - - if block_message is not None: - block_result = json.dumps({"error": block_message}, ensure_ascii=False) - else: - guardrail_decision = self._tool_guardrails.before_call(function_name, function_args) - if not guardrail_decision.allows_execution: - block_result = self._guardrail_block_result(guardrail_decision) - blocked_by_guardrail = True - - parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail)) - - # ── Logging / callbacks ────────────────────────────────────────── - tool_names_str = ", ".join(name for _, name, _, _, _ in parsed_calls) - if not self.quiet_mode: - print(f" ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}") - for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1): - args_str = json.dumps(args, ensure_ascii=False) - if self.verbose_logging: - print(f" 📞 Tool {i}: {name}({list(args.keys())})") - print(self._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False))) - else: - args_preview = args_str[:self.log_prefix_chars] + "..." 
if len(args_str) > self.log_prefix_chars else args_str - print(f" 📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}") - - for tc, name, args, block_result, blocked_by_guardrail in parsed_calls: - if block_result is not None: - continue - if self.tool_progress_callback: - try: - preview = _build_tool_preview(name, args) - self.tool_progress_callback("tool.started", name, preview, args) - except Exception as cb_err: - logging.debug(f"Tool progress callback error: {cb_err}") - - for tc, name, args, block_result, blocked_by_guardrail in parsed_calls: - if block_result is not None: - continue - if self.tool_start_callback: - try: - self.tool_start_callback(tc.id, name, args) - except Exception as cb_err: - logging.debug(f"Tool start callback error: {cb_err}") - - # ── Concurrent execution ───────────────────────────────────────── - # Each slot holds (function_name, function_args, function_result, duration, error_flag, blocked_flag) - results = [None] * num_tools - for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls): - if block_result is not None: - results[i] = (name, args, block_result, 0.0, True, True) - - # Touch activity before launching workers so the gateway knows - # we're executing tools (not stuck). - self._current_tool = tool_names_str - self._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}") - - # Capture CLI callbacks from the agent thread so worker threads can - # register them locally. Without this, _get_approval_callback() in - # terminal_tool returns None in ThreadPoolExecutor workers, causing - # the dangerous-command prompt to fall back to input() — which - # deadlocks against prompt_toolkit's raw terminal mode (#13617). 
- _parent_approval_cb = _get_approval_callback() - _parent_sudo_cb = _get_sudo_password_callback() - - def _run_tool(index, tool_call, function_name, function_args): - """Worker function executed in a thread.""" - # Register this worker tid so the agent can fan out an interrupt - # to it — see AIAgent.interrupt(). Must happen first thing, and - # must be paired with discard + clear in the finally block. - _worker_tid = threading.current_thread().ident - with self._tool_worker_threads_lock: - self._tool_worker_threads.add(_worker_tid) - # Race: if the agent was interrupted between fan-out (which - # snapshotted an empty/earlier set) and our registration, apply - # the interrupt to our own tid now so is_interrupted() inside - # the tool returns True on the next poll. - if self._interrupt_requested: - try: - _set_interrupt(True, _worker_tid) - except Exception: - pass - # Set the activity callback on THIS worker thread so - # _wait_for_process (terminal commands) can fire heartbeats. - # The callback is thread-local; the main thread's callback - # is invisible to worker threads. - try: - from tools.environments.base import set_activity_callback - set_activity_callback(self._touch_activity) - except Exception: - pass - # Propagate approval/sudo callbacks to this worker thread. - # Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr). 
- if _parent_approval_cb is not None: - try: - _set_approval_callback(_parent_approval_cb) - except Exception: - pass - if _parent_sudo_cb is not None: - try: - _set_sudo_password_callback(_parent_sudo_cb) - except Exception: - pass - start = time.time() - try: - result = self._invoke_tool( - function_name, - function_args, - effective_task_id, - tool_call.id, - messages=messages, - pre_tool_block_checked=True, - ) - except Exception as tool_error: - result = f"Error executing tool '{function_name}': {tool_error}" - logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True) - duration = time.time() - start - is_error, _ = _detect_tool_failure(function_name, result) - if is_error: - logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200]) - else: - logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result)) - results[index] = (function_name, function_args, result, duration, is_error, False) - # Tear down worker-tid tracking. Clear any interrupt bit we may - # have set so the next task scheduled onto this recycled tid - # starts with a clean slate. - with self._tool_worker_threads_lock: - self._tool_worker_threads.discard(_worker_tid) - try: - _set_interrupt(False, _worker_tid) - except Exception: - pass - # Clear thread-local callbacks so a recycled worker thread - # doesn't hold stale references to a disposed CLI instance. 
- try: - _set_approval_callback(None) - _set_sudo_password_callback(None) - except Exception: - pass - - # Start spinner for CLI mode (skip when TUI handles tool progress) - spinner = None - if self._should_emit_quiet_tool_messages() and self._should_start_quiet_spinner(): - face = random.choice(KawaiiSpinner.get_waiting_faces()) - spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots', print_fn=self._print_fn) - spinner.start() - - try: - runnable_calls = [ - (i, tc, name, args) - for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls) - if block_result is None - ] - futures = [] - if runnable_calls: - max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - for i, tc, name, args in runnable_calls: - # Propagate ContextVars (e.g. _approval_session_key); mirrors asyncio.to_thread. - ctx = contextvars.copy_context() - f = executor.submit(ctx.run, _run_tool, i, tc, name, args) - futures.append(f) - - # Wait for all to complete with periodic heartbeats so the - # gateway's inactivity monitor doesn't kill us during long - # concurrent tool batches. Also check for user interrupts - # so we don't block indefinitely when the user sends /stop - # or a new message during concurrent tool execution. - _conc_start = time.time() - _interrupt_logged = False - while True: - done, not_done = concurrent.futures.wait( - futures, timeout=5.0, - ) - if not not_done: - break - - # Check for interrupt — the per-thread interrupt signal - # already causes individual tools (terminal, execute_code) - # to abort, but tools without interrupt checks (web_search, - # read_file) will run to completion. Cancel any futures - # that haven't started yet so we don't block on them. 
- if self._interrupt_requested: - if not _interrupt_logged: - _interrupt_logged = True - self._vprint( - f"{self.log_prefix}⚡ Interrupt: cancelling " - f"{len(not_done)} pending concurrent tool(s)", - force=True, - ) - for f in not_done: - f.cancel() - # Give already-running tools a moment to notice the - # per-thread interrupt signal and exit gracefully. - concurrent.futures.wait(not_done, timeout=3.0) - break - - _conc_elapsed = int(time.time() - _conc_start) - # Heartbeat every ~30s (6 × 5s poll intervals) - if _conc_elapsed > 0 and _conc_elapsed % 30 < 6: - _still_running = [ - parsed_calls[futures.index(f)][1] - for f in not_done - if f in futures - ] - self._touch_activity( - f"concurrent tools running ({_conc_elapsed}s, " - f"{len(not_done)} remaining: {', '.join(_still_running[:3])})" - ) - finally: - if spinner: - # Build a summary message for the spinner stop - completed = sum(1 for r in results if r is not None) - total_dur = sum(r[3] for r in results if r is not None) - spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total") - - # ── Post-execution: display per-tool results ───────────────────── - for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls): - r = results[i] - blocked = False - if r is None: - # Tool was cancelled (interrupt) or thread didn't return - if self._interrupt_requested: - function_result = f"[Tool execution cancelled — {name} was skipped due to user interrupt]" - else: - function_result = f"Error executing tool '{name}': thread did not return a result" - tool_duration = 0.0 - else: - function_name, function_args, function_result, tool_duration, is_error, blocked = r - - if not blocked: - function_result = self._append_guardrail_observation( - function_name, - function_args, - function_result, - failed=is_error, - ) - - if is_error: - _err_text = _multimodal_text_summary(function_result) - result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text - 
logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview) - - # Track file-mutation outcome for the turn-end verifier. - # `blocked` calls never actually ran — don't let a guardrail - # block count as either a failure or a success. - if not blocked: - try: - self._record_file_mutation_result( - function_name, function_args, function_result, is_error, - ) - except Exception as _ver_err: - logging.debug("file-mutation verifier record failed: %s", _ver_err) - - if not blocked and self.tool_progress_callback: - try: - self.tool_progress_callback( - "tool.completed", function_name, None, None, - duration=tool_duration, is_error=is_error, - ) - except Exception as cb_err: - logging.debug(f"Tool progress callback error: {cb_err}") - - if self.verbose_logging: - logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s") - logging.debug(f"Tool result ({len(function_result)} chars): {function_result}") - - # Print cute message per tool - if self._should_emit_quiet_tool_messages(): - cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result) - self._safe_print(f" {cute_msg}") - elif not self.quiet_mode: - _preview_str = _multimodal_text_summary(function_result) - if self.verbose_logging: - print(f" ✅ Tool {i+1} completed in {tool_duration:.2f}s") - print(self._wrap_verbose("Result: ", _preview_str)) - else: - response_preview = _preview_str[:self.log_prefix_chars] + "..." 
if len(_preview_str) > self.log_prefix_chars else _preview_str - print(f" ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}") - - self._current_tool = None - self._touch_activity(f"tool completed: {name} ({tool_duration:.1f}s)") - - if not blocked and self.tool_complete_callback: - try: - self.tool_complete_callback(tc.id, name, args, function_result) - except Exception as cb_err: - logging.debug(f"Tool complete callback error: {cb_err}") - - function_result = maybe_persist_tool_result( - content=function_result, - tool_name=name, - tool_use_id=tc.id, - env=get_active_env(effective_task_id), - ) if not _is_multimodal_tool_result(function_result) else function_result - - subdir_hints = self._subdirectory_hints.check_tool_call(name, args) - if subdir_hints: - if _is_multimodal_tool_result(function_result): - # Append the hint to the text summary part so the model - # still sees it; don't touch the image blocks. - _append_subdir_hint_to_multimodal(function_result, subdir_hints) - else: - function_result += subdir_hints - - # Unwrap _multimodal dicts to an OpenAI-style content list so any - # vision-capable provider receives [{type:text},{type:image_url}] - # rather than a raw Python dict. The Anthropic adapter already - # accepts content lists; vision-capable OpenAI-compatible servers - # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively. - # Text-only servers get a string-safe fallback here so a rejected - # image tool result never poisons canonical session history. - # String results pass through unchanged. - _tool_content = self._tool_result_content_for_active_model(name, function_result) - tool_msg = { - "role": "tool", - "name": name, - "content": _tool_content, - "tool_call_id": tc.id, - } - messages.append(tool_msg) - - # ── Per-tool /steer drain ─────────────────────────────────── - # Same as the sequential path: drain between each collected - # result so the steer lands as early as possible. 
- self._apply_pending_steer_to_tool_results(messages, 1) - - # ── Per-turn aggregate budget enforcement ───────────────────────── - num_tools = len(parsed_calls) - if num_tools > 0: - turn_tool_msgs = messages[-num_tools:] - enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id)) - - # ── /steer injection ────────────────────────────────────────────── - # Append any pending user steer text to the last tool result so the - # agent sees it on its next iteration. Runs AFTER budget enforcement - # so the steer marker is never truncated. See steer() for details. - if num_tools > 0: - self._apply_pending_steer_to_tool_results(messages, num_tools) + """Forwarder — see ``agent.tool_executor.execute_tool_calls_concurrent``.""" + from agent.tool_executor import execute_tool_calls_concurrent + return execute_tool_calls_concurrent(self, assistant_message, messages, effective_task_id, api_call_count) def _execute_tool_calls_sequential(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None: - """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools.""" - for i, tool_call in enumerate(assistant_message.tool_calls, 1): - # SAFETY: check interrupt BEFORE starting each tool. - # If the user sent "stop" during a previous tool's execution, - # do NOT start any more tools -- skip them all immediately. 
- if self._interrupt_requested: - remaining_calls = assistant_message.tool_calls[i-1:] - if remaining_calls: - self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True) - for skipped_tc in remaining_calls: - skipped_name = skipped_tc.function.name - skip_msg = { - "role": "tool", - "name": skipped_name, - "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]", - "tool_call_id": skipped_tc.id, - } - messages.append(skip_msg) - break - - function_name = tool_call.function.name - - try: - function_args = json.loads(tool_call.function.arguments) - except json.JSONDecodeError as e: - logging.warning(f"Unexpected JSON error after validation: {e}") - function_args = {} - if not isinstance(function_args, dict): - function_args = {} - - # Check plugin hooks for a block directive before executing. - _block_msg: Optional[str] = None - try: - from hermes_cli.plugins import get_pre_tool_call_block_message - _block_msg = get_pre_tool_call_block_message( - function_name, function_args, task_id=effective_task_id or "", - ) - except Exception: - pass - - _guardrail_block_decision: ToolGuardrailDecision | None = None - if _block_msg is None: - guardrail_decision = self._tool_guardrails.before_call(function_name, function_args) - if not guardrail_decision.allows_execution: - _guardrail_block_decision = guardrail_decision - - _execution_blocked = _block_msg is not None or _guardrail_block_decision is not None - - if _execution_blocked: - # Tool blocked by plugin or guardrail policy — skip counters, - # callbacks, checkpointing, activity mutation, and real execution. 
- pass - # Reset nudge counters when the relevant tool is actually used - elif function_name == "memory": - self._turns_since_memory = 0 - elif function_name == "skill_manage": - self._iters_since_skill = 0 - - if not self.quiet_mode: - args_str = json.dumps(function_args, ensure_ascii=False) - if self.verbose_logging: - print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())})") - print(self._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False))) - else: - args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str - print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}") - - if not _execution_blocked: - self._current_tool = function_name - self._touch_activity(f"executing tool: {function_name}") - - # Set activity callback for long-running tool execution (terminal - # commands, etc.) so the gateway's inactivity monitor doesn't kill - # the agent while a command is running. 
- if not _execution_blocked: - try: - from tools.environments.base import set_activity_callback - set_activity_callback(self._touch_activity) - except Exception: - pass - - if not _execution_blocked and self.tool_progress_callback: - try: - preview = _build_tool_preview(function_name, function_args) - self.tool_progress_callback("tool.started", function_name, preview, function_args) - except Exception as cb_err: - logging.debug(f"Tool progress callback error: {cb_err}") - - if not _execution_blocked and self.tool_start_callback: - try: - self.tool_start_callback(tool_call.id, function_name, function_args) - except Exception as cb_err: - logging.debug(f"Tool start callback error: {cb_err}") - - # Checkpoint: snapshot working dir before file-mutating tools - if not _execution_blocked and function_name in {"write_file", "patch"} and self._checkpoint_mgr.enabled: - try: - file_path = function_args.get("path", "") - if file_path: - work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path) - self._checkpoint_mgr.ensure_checkpoint( - work_dir, f"before {function_name}" - ) - except Exception: - pass # never block tool execution - - # Checkpoint before destructive terminal commands - if not _execution_blocked and function_name == "terminal" and self._checkpoint_mgr.enabled: - try: - cmd = function_args.get("command", "") - if _is_destructive_command(cmd): - cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd()) - self._checkpoint_mgr.ensure_checkpoint( - cwd, f"before terminal: {cmd[:60]}" - ) - except Exception: - pass # never block tool execution - - tool_start_time = time.time() - - if _block_msg is not None: - # Tool blocked by plugin policy — return error without executing. 
- function_result = json.dumps({"error": _block_msg}, ensure_ascii=False) - tool_duration = 0.0 - elif _guardrail_block_decision is not None: - # Tool blocked by tool-loop guardrail — synthesize exactly one - # tool result for the original tool_call_id without executing. - function_result = self._guardrail_block_result(_guardrail_block_decision) - tool_duration = 0.0 - elif function_name == "todo": - from tools.todo_tool import todo_tool as _todo_tool - function_result = _todo_tool( - todos=function_args.get("todos"), - merge=function_args.get("merge", False), - store=self._todo_store, - ) - tool_duration = time.time() - tool_start_time - if self._should_emit_quiet_tool_messages(): - self._vprint(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}") - elif function_name == "session_search": - session_db = self._get_session_db_for_recall() - if not session_db: - from hermes_state import format_session_db_unavailable - function_result = json.dumps({"success": False, "error": format_session_db_unavailable()}) - else: - from tools.session_search_tool import session_search as _session_search - function_result = _session_search( - query=function_args.get("query", ""), - role_filter=function_args.get("role_filter"), - limit=function_args.get("limit", 3), - db=session_db, - current_session_id=self.session_id, - ) - tool_duration = time.time() - tool_start_time - if self._should_emit_quiet_tool_messages(): - self._vprint(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}") - elif function_name == "memory": - target = function_args.get("target", "memory") - from tools.memory_tool import memory_tool as _memory_tool - function_result = _memory_tool( - action=function_args.get("action"), - target=target, - content=function_args.get("content"), - old_text=function_args.get("old_text"), - store=self._memory_store, - ) - # Bridge: notify external memory provider of built-in memory 
writes - if self._memory_manager and function_args.get("action") in {"add", "replace"}: - try: - self._memory_manager.on_memory_write( - function_args.get("action", ""), - target, - function_args.get("content", ""), - metadata=self._build_memory_write_metadata( - task_id=effective_task_id, - tool_call_id=getattr(tool_call, "id", None), - ), - ) - except Exception: - pass - tool_duration = time.time() - tool_start_time - if self._should_emit_quiet_tool_messages(): - self._vprint(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}") - elif function_name == "clarify": - from tools.clarify_tool import clarify_tool as _clarify_tool - function_result = _clarify_tool( - question=function_args.get("question", ""), - choices=function_args.get("choices"), - callback=self.clarify_callback, - ) - tool_duration = time.time() - tool_start_time - if self._should_emit_quiet_tool_messages(): - self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") - elif function_name == "delegate_task": - tasks_arg = function_args.get("tasks") - if tasks_arg and isinstance(tasks_arg, list): - spinner_label = f"🔀 delegating {len(tasks_arg)} tasks" - else: - goal_preview = (function_args.get("goal") or "")[:30] - spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating" - spinner = None - if self._should_emit_quiet_tool_messages() and self._should_start_quiet_spinner(): - face = random.choice(KawaiiSpinner.get_waiting_faces()) - spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots', print_fn=self._print_fn) - spinner.start() - self._delegate_spinner = spinner - _delegate_result = None - try: - function_result = self._dispatch_delegate_task(function_args) - _delegate_result = function_result - finally: - self._delegate_spinner = None - tool_duration = time.time() - tool_start_time - cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, 
result=_delegate_result) - if spinner: - spinner.stop(cute_msg) - elif self._should_emit_quiet_tool_messages(): - self._vprint(f" {cute_msg}") - elif self._context_engine_tool_names and function_name in self._context_engine_tool_names: - # Context engine tools (lcm_grep, lcm_describe, lcm_expand, etc.) - spinner = None - if self._should_emit_quiet_tool_messages(): - face = random.choice(KawaiiSpinner.get_waiting_faces()) - emoji = _get_tool_emoji(function_name) - preview = _build_tool_preview(function_name, function_args) or function_name - spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=self._print_fn) - spinner.start() - _ce_result = None - try: - function_result = self.context_compressor.handle_tool_call(function_name, function_args, messages=messages) - _ce_result = function_result - except Exception as tool_error: - function_result = json.dumps({"error": f"Context engine tool '{function_name}' failed: {tool_error}"}) - logger.error("context_engine.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True) - finally: - tool_duration = time.time() - tool_start_time - cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_ce_result) - if spinner: - spinner.stop(cute_msg) - elif self._should_emit_quiet_tool_messages(): - self._vprint(f" {cute_msg}") - elif self._memory_manager and self._memory_manager.has_tool(function_name): - # Memory provider tools (hindsight_retain, honcho_search, etc.) - # These are not in the tool registry — route through MemoryManager. 
- spinner = None - if self._should_emit_quiet_tool_messages() and self._should_start_quiet_spinner(): - face = random.choice(KawaiiSpinner.get_waiting_faces()) - emoji = _get_tool_emoji(function_name) - preview = _build_tool_preview(function_name, function_args) or function_name - spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=self._print_fn) - spinner.start() - _mem_result = None - try: - function_result = self._memory_manager.handle_tool_call(function_name, function_args) - _mem_result = function_result - except Exception as tool_error: - function_result = json.dumps({"error": f"Memory tool '{function_name}' failed: {tool_error}"}) - logger.error("memory_manager.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True) - finally: - tool_duration = time.time() - tool_start_time - cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_mem_result) - if spinner: - spinner.stop(cute_msg) - elif self._should_emit_quiet_tool_messages(): - self._vprint(f" {cute_msg}") - elif self.quiet_mode: - spinner = None - if self._should_emit_quiet_tool_messages() and self._should_start_quiet_spinner(): - face = random.choice(KawaiiSpinner.get_waiting_faces()) - emoji = _get_tool_emoji(function_name) - preview = _build_tool_preview(function_name, function_args) or function_name - spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=self._print_fn) - spinner.start() - _spinner_result = None - try: - function_result = handle_function_call( - function_name, function_args, effective_task_id, - tool_call_id=tool_call.id, - session_id=self.session_id or "", - enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None, - skip_pre_tool_call_hook=True, - ) - _spinner_result = function_result - except Exception as tool_error: - function_result = f"Error executing tool '{function_name}': {tool_error}" - logger.error("handle_function_call raised for %s: 
%s", function_name, tool_error, exc_info=True) - finally: - tool_duration = time.time() - tool_start_time - cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result) - if spinner: - spinner.stop(cute_msg) - elif self._should_emit_quiet_tool_messages(): - self._vprint(f" {cute_msg}") - else: - try: - function_result = handle_function_call( - function_name, function_args, effective_task_id, - tool_call_id=tool_call.id, - session_id=self.session_id or "", - enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None, - skip_pre_tool_call_hook=True, - ) - except Exception as tool_error: - function_result = f"Error executing tool '{function_name}': {tool_error}" - logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True) - tool_duration = time.time() - tool_start_time - - if isinstance(function_result, str): - result_preview = function_result if self.verbose_logging else ( - function_result[:200] if len(function_result) > 200 else function_result - ) - _result_len = len(function_result) - else: - # Multimodal dict result (_multimodal=True) — not sliceable as string - result_preview = function_result - _result_len = len(str(function_result)) - - # Log tool errors to the persistent error log so [error] tags - # in the UI always have a corresponding detailed entry on disk. 
- _is_error_result, _ = _detect_tool_failure(function_name, function_result) - if not _execution_blocked: - function_result = self._append_guardrail_observation( - function_name, - function_args, - function_result, - failed=_is_error_result, - ) - result_preview = function_result if self.verbose_logging else ( - function_result[:200] if len(function_result) > 200 else function_result - ) - if _is_error_result: - logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview) - else: - logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len) - - # Track file-mutation outcome for the turn-end verifier. See - # the concurrent path for the rationale; both paths must feed - # the same state so the footer reflects every tool call in the - # turn, not just the parallel ones. - if not _execution_blocked: - try: - self._record_file_mutation_result( - function_name, function_args, function_result, _is_error_result, - ) - except Exception as _ver_err: - logging.debug("file-mutation verifier record failed: %s", _ver_err) - - if not _execution_blocked and self.tool_progress_callback: - try: - self.tool_progress_callback( - "tool.completed", function_name, None, None, - duration=tool_duration, is_error=_is_error_result, - ) - except Exception as cb_err: - logging.debug(f"Tool progress callback error: {cb_err}") - - self._current_tool = None - self._touch_activity(f"tool completed: {function_name} ({tool_duration:.1f}s)") - - if self.verbose_logging: - logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s") - _log_result = _multimodal_text_summary(function_result) - logging.debug(f"Tool result ({len(_log_result)} chars): {_log_result}") - - if not _execution_blocked and self.tool_complete_callback: - try: - self.tool_complete_callback(tool_call.id, function_name, function_args, function_result) - except Exception as cb_err: - logging.debug(f"Tool complete callback error: {cb_err}") - - 
function_result = maybe_persist_tool_result( - content=function_result, - tool_name=function_name, - tool_use_id=tool_call.id, - env=get_active_env(effective_task_id), - ) if not _is_multimodal_tool_result(function_result) else function_result - - # Discover subdirectory context files from tool arguments - subdir_hints = self._subdirectory_hints.check_tool_call(function_name, function_args) - if subdir_hints: - if _is_multimodal_tool_result(function_result): - _append_subdir_hint_to_multimodal(function_result, subdir_hints) - else: - function_result += subdir_hints - - # Unwrap _multimodal dicts to an OpenAI-style content list - # (see parallel path for rationale). String results pass through. - _tool_content = self._tool_result_content_for_active_model(function_name, function_result) - tool_msg = { - "role": "tool", - "name": function_name, - "content": _tool_content, - "tool_call_id": tool_call.id - } - messages.append(tool_msg) - - # ── Per-tool /steer drain ─────────────────────────────────── - # Drain pending steer BETWEEN individual tool calls so the - # injection lands as soon as a tool finishes — not after the - # entire batch. The model sees it on the next API iteration. - self._apply_pending_steer_to_tool_results(messages, 1) - - if not self.quiet_mode: - if self.verbose_logging: - print(f" ✅ Tool {i} completed in {tool_duration:.2f}s") - print(self._wrap_verbose("Result: ", function_result)) - else: - _fr_str = function_result if isinstance(function_result, str) else str(function_result) - response_preview = _fr_str[:self.log_prefix_chars] + "..." 
if len(_fr_str) > self.log_prefix_chars else _fr_str - print(f" ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}") - - if self._interrupt_requested and i < len(assistant_message.tool_calls): - remaining = len(assistant_message.tool_calls) - i - self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True) - for skipped_tc in assistant_message.tool_calls[i:]: - skipped_name = skipped_tc.function.name - skip_msg = { - "role": "tool", - "name": skipped_name, - "content": f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]", - "tool_call_id": skipped_tc.id - } - messages.append(skip_msg) - break - - if self.tool_delay > 0 and i < len(assistant_message.tool_calls): - time.sleep(self.tool_delay) - - # ── Per-turn aggregate budget enforcement ───────────────────────── - num_tools_seq = len(assistant_message.tool_calls) - if num_tools_seq > 0: - enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id)) - - # ── /steer injection ────────────────────────────────────────────── - # See _execute_tool_calls_parallel for the rationale. Same hook, - # applied to sequential execution as well. - if num_tools_seq > 0: - self._apply_pending_steer_to_tool_results(messages, num_tools_seq) - + """Forwarder — see ``agent.tool_executor.execute_tool_calls_sequential``.""" + from agent.tool_executor import execute_tool_calls_sequential + return execute_tool_calls_sequential(self, assistant_message, messages, effective_task_id, api_call_count) def _handle_max_iterations(self, messages: list, api_call_count: int) -> str: - """Request a summary when max iterations are reached. Returns the final response text.""" - print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...") - - summary_request = ( - "You've reached the maximum number of tool-calling iterations allowed. 
" - "Please provide a final response summarizing what you've found and accomplished so far, " - "without calling any more tools." - ) - messages.append({"role": "user", "content": summary_request}) - - try: - # Build API messages, stripping internal-only fields - # (finish_reason, reasoning) that strict APIs like Mistral reject with 422 - _needs_sanitize = self._should_sanitize_tool_calls() - api_messages = [] - for msg in messages: - api_msg = msg.copy() - self._copy_reasoning_content_for_api(msg, api_msg) - for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"): - api_msg.pop(internal_field, None) - if _needs_sanitize: - self._sanitize_tool_calls_for_strict_api(api_msg) - api_messages.append(api_msg) - - effective_system = self._cached_system_prompt or "" - if self.ephemeral_system_prompt: - effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() - if effective_system: - api_messages = [{"role": "system", "content": effective_system}] + api_messages - if self.prefill_messages: - sys_offset = 1 if effective_system else 0 - for idx, pfm in enumerate(self.prefill_messages): - api_messages.insert(sys_offset + idx, pfm.copy()) - - # Same safety net as the main loop: repair tool-call/result - # pairing before asking for a final summary. Compression and - # session resume can leave a tool result whose parent assistant - # tool_call was summarized away; Responses API rejects that as - # "No tool call found for function call output". - api_messages = self._sanitize_api_messages(api_messages) - - # Same safety net as the main loop: drop thinking-only assistant - # turns so Anthropic-family providers don't 400 the summary call. 
- api_messages = self._drop_thinking_only_and_merge_users(api_messages) - - summary_extra_body = {} - try: - from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE as _OMIT_TEMP - except Exception: - _fixed_temperature_for_model = None - _OMIT_TEMP = None - _raw_summary_temp = ( - _fixed_temperature_for_model(self.model, self.base_url) - if _fixed_temperature_for_model is not None - else None - ) - _omit_summary_temperature = _raw_summary_temp is _OMIT_TEMP - _summary_temperature = None if _omit_summary_temperature else _raw_summary_temp - _is_nous = "nousresearch" in self._base_url_lower - # LM Studio uses top-level `reasoning_effort` (not extra_body.reasoning). - # Mirror ChatCompletionsTransport.build_kwargs() so the summary path - # — which calls chat.completions.create() directly without going - # through the transport — sends the same shape the transport does. - _is_lmstudio_summary = ( - (self.provider or "").strip().lower() == "lmstudio" - and self._supports_reasoning_extra_body() - ) - _lm_reasoning_effort: str | None = ( - self._resolve_lmstudio_summary_reasoning_effort() - if _is_lmstudio_summary else None - ) - if not _is_lmstudio_summary and self._supports_reasoning_extra_body(): - if self.reasoning_config is not None: - summary_extra_body["reasoning"] = self.reasoning_config - else: - summary_extra_body["reasoning"] = { - "enabled": True, - "effort": "medium" - } - if _is_nous: - from agent.portal_tags import nous_portal_tags as _portal_tags - summary_extra_body["tags"] = _portal_tags() - - if self.api_mode == "codex_responses": - codex_kwargs = self._build_api_kwargs(api_messages) - codex_kwargs.pop("tools", None) - summary_response = self._run_codex_stream(codex_kwargs) - _ct_sum = self._get_transport() - _cnr_sum = _ct_sum.normalize_response(summary_response) - final_response = (_cnr_sum.content or "").strip() - else: - summary_kwargs = { - "model": self.model, - "messages": api_messages, - } - if _summary_temperature is 
not None: - summary_kwargs["temperature"] = _summary_temperature - if self.max_tokens is not None: - summary_kwargs.update(self._max_tokens_param(self.max_tokens)) - if _lm_reasoning_effort is not None: - summary_kwargs["reasoning_effort"] = _lm_reasoning_effort - - # Include provider routing preferences - provider_preferences = {} - if self.providers_allowed: - provider_preferences["only"] = self.providers_allowed - if self.providers_ignored: - provider_preferences["ignore"] = self.providers_ignored - if self.providers_order: - provider_preferences["order"] = self.providers_order - if self.provider_sort: - provider_preferences["sort"] = self.provider_sort - if provider_preferences and ( - (self.provider or "").strip().lower() == "openrouter" - or self._is_openrouter_url() - ): - summary_extra_body["provider"] = provider_preferences - - # Pareto Code router plugin — model-gated. Same shape as - # the main-loop emission so summary calls on - # openrouter/pareto-code respect the user's coding-score floor. 
- if ( - self.model == "openrouter/pareto-code" - and ( - (self.provider or "").strip().lower() == "openrouter" - or self._is_openrouter_url() - ) - and self.openrouter_min_coding_score is not None - and self.openrouter_min_coding_score != "" - ): - try: - _ps = float(self.openrouter_min_coding_score) - except (TypeError, ValueError): - _ps = None - if _ps is not None and 0.0 <= _ps <= 1.0: - summary_extra_body["plugins"] = [ - {"id": "pareto-router", "min_coding_score": _ps} - ] - - if summary_extra_body: - summary_kwargs["extra_body"] = summary_extra_body - - if self.api_mode == "anthropic_messages": - _tsum = self._get_transport() - _ant_kw = _tsum.build_kwargs(model=self.model, messages=api_messages, tools=None, - max_tokens=self.max_tokens, reasoning_config=self.reasoning_config, - is_oauth=self._is_anthropic_oauth, - preserve_dots=self._anthropic_preserve_dots()) - summary_response = self._anthropic_messages_create(_ant_kw) - _summary_result = _tsum.normalize_response(summary_response, strip_tool_prefix=self._is_anthropic_oauth) - final_response = (_summary_result.content or "").strip() - else: - summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary").chat.completions.create(**summary_kwargs) - _summary_result = self._get_transport().normalize_response(summary_response) - final_response = (_summary_result.content or "").strip() - - if final_response: - if "" in final_response: - final_response = re.sub(r'.*?\s*', '', final_response, flags=re.DOTALL).strip() - if final_response: - messages.append({"role": "assistant", "content": final_response}) - else: - final_response = "I reached the iteration limit and couldn't generate a summary." 
- else: - # Retry summary generation - if self.api_mode == "codex_responses": - codex_kwargs = self._build_api_kwargs(api_messages) - codex_kwargs.pop("tools", None) - retry_response = self._run_codex_stream(codex_kwargs) - _ct_retry = self._get_transport() - _cnr_retry = _ct_retry.normalize_response(retry_response) - final_response = (_cnr_retry.content or "").strip() - elif self.api_mode == "anthropic_messages": - _tretry = self._get_transport() - _ant_kw2 = _tretry.build_kwargs(model=self.model, messages=api_messages, tools=None, - is_oauth=self._is_anthropic_oauth, - max_tokens=self.max_tokens, reasoning_config=self.reasoning_config, - preserve_dots=self._anthropic_preserve_dots()) - retry_response = self._anthropic_messages_create(_ant_kw2) - _retry_result = _tretry.normalize_response(retry_response, strip_tool_prefix=self._is_anthropic_oauth) - final_response = (_retry_result.content or "").strip() - else: - summary_kwargs = { - "model": self.model, - "messages": api_messages, - } - if _summary_temperature is not None: - summary_kwargs["temperature"] = _summary_temperature - if self.max_tokens is not None: - summary_kwargs.update(self._max_tokens_param(self.max_tokens)) - if _lm_reasoning_effort is not None: - summary_kwargs["reasoning_effort"] = _lm_reasoning_effort - if summary_extra_body: - summary_kwargs["extra_body"] = summary_extra_body - - summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary_retry").chat.completions.create(**summary_kwargs) - _retry_result = self._get_transport().normalize_response(summary_response) - final_response = (_retry_result.content or "").strip() - - if final_response: - if "" in final_response: - final_response = re.sub(r'.*?\s*', '', final_response, flags=re.DOTALL).strip() - if final_response: - messages.append({"role": "assistant", "content": final_response}) - else: - final_response = "I reached the iteration limit and couldn't generate a summary." 
- else: - final_response = "I reached the iteration limit and couldn't generate a summary." - - except Exception as e: - logging.warning(f"Failed to get summary response: {e}") - final_response = f"I reached the maximum iterations ({self.max_iterations}) but couldn't summarize. Error: {str(e)}" - - return final_response + """Forwarder — see ``agent.chat_completion_helpers.handle_max_iterations``.""" + from agent.chat_completion_helpers import handle_max_iterations + return handle_max_iterations(self, messages, api_call_count) def run_conversation( self, @@ -12153,3932 +3844,9 @@ class AIAgent: stream_callback: Optional[callable] = None, persist_user_message: Optional[str] = None, ) -> Dict[str, Any]: - """ - Run a complete conversation with tool calling until completion. - - Args: - user_message (str): The user's message/question - system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided) - conversation_history (List[Dict]): Previous conversation messages (optional) - task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided) - stream_callback: Optional callback invoked with each text delta during streaming. - Used by the TTS pipeline to start audio generation before the full response. - When None (default), API calls use the standard non-streaming path. - persist_user_message: Optional clean user message to store in - transcripts/history when user_message contains API-only - synthetic prefixes. - or queuing follow-up prefetch work. - - Returns: - Dict: Complete conversation result with final response and message history - """ - # Guard stdio against OSError from broken pipes (systemd/headless/daemon). - # Installed once, transparent when streams are healthy, prevents crash on write. - _install_safe_stdio() - - self._ensure_db_session() - - # Tell auxiliary_client what the live main provider/model are for - # this turn. 
Used by tools whose behaviour depends on the active - # main model (e.g. vision_analyze's native fast path) so they see - # the CLI/gateway override instead of the stale config.yaml - # default. Idempotent — fine to call every turn. - try: - from agent.auxiliary_client import set_runtime_main - set_runtime_main( - getattr(self, "provider", "") or "", - getattr(self, "model", "") or "", - ) - except Exception: - pass - - # Tag all log records on this thread with the session ID so - # ``hermes logs --session `` can filter a single conversation. - from hermes_logging import set_session_context - set_session_context(self.session_id) - - # Bind the skill write-origin ContextVar for this thread so tool - # handlers (e.g. skill_manage create) can tell whether they are - # running inside the background self-improvement review fork vs. - # a foreground user-directed turn. Set at the top of each call; - # the review fork runs on its own thread with a fresh context, - # so the foreground value here does not leak into it. - from tools.skill_provenance import set_current_write_origin - set_current_write_origin(getattr(self, "_memory_write_origin", "assistant_tool")) - - # If the previous turn activated fallback, restore the primary - # runtime so this turn gets a fresh attempt with the preferred model. - # No-op when _fallback_activated is False (gateway, first turn, etc.). - self._restore_primary_runtime() - - # Sanitize surrogate characters from user input. Clipboard paste from - # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates - # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK. 
- if isinstance(user_message, str): - user_message = _sanitize_surrogates(user_message) - if isinstance(persist_user_message, str): - persist_user_message = _sanitize_surrogates(persist_user_message) - - # Store stream callback for _interruptible_api_call to pick up - self._stream_callback = stream_callback - self._persist_user_message_idx = None - self._persist_user_message_override = persist_user_message - # Generate unique task_id if not provided to isolate VMs between concurrent tasks - effective_task_id = task_id or str(uuid.uuid4()) - # Expose the active task_id so tools running mid-turn (e.g. delegate_task - # in delegate_tool.py) can identify this agent for the cross-agent file - # state registry. Set BEFORE any tool dispatch so snapshots taken at - # child-launch time see the parent's real id, not None. - self._current_task_id = effective_task_id - - # Reset retry counters and iteration budget at the start of each turn - # so subagent usage from a previous turn doesn't eat into the next one. - self._invalid_tool_retries = 0 - self._invalid_json_retries = 0 - self._empty_content_retries = 0 - self._incomplete_scratchpad_retries = 0 - self._codex_incomplete_retries = 0 - self._thinking_prefill_retries = 0 - self._post_tool_empty_retried = False - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._mute_post_response = False - self._unicode_sanitization_passes = 0 - self._tool_guardrails.reset_for_turn() - self._tool_guardrail_halt_decision = None - # True until the server rejects an image_url content part with an error - # like "Only 'text' content type is supported." Set to False on first - # rejection and kept False for the rest of the session so we never re-send - # images to a text-only endpoint. Scoped per `_run()` call, not per instance. - self._vision_supported = True - - # Pre-turn connection health check: detect and clean up dead TCP - # connections left over from provider outages or dropped streams. 
- # This prevents the next API call from hanging on a zombie socket. - if self.api_mode != "anthropic_messages": - try: - if self._cleanup_dead_connections(): - self._emit_status( - "🔌 Detected stale connections from a previous provider " - "issue — cleaned up automatically. Proceeding with fresh " - "connection." - ) - except Exception: - pass - # Replay compression warning through status_callback for gateway - # platforms (the callback was not wired during __init__). - if self._compression_warning: - self._replay_compression_warning() - self._compression_warning = None # send once - - # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here. - # They are initialized in __init__ and must persist across run_conversation - # calls so that nudge logic accumulates correctly in CLI mode. - self.iteration_budget = IterationBudget(self.max_iterations) - - # Log conversation turn start for debugging/observability - _preview_text = _summarize_user_message_for_log(user_message) - _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text - _msg_preview = _msg_preview.replace("\n", " ") - logger.info( - "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r", - self.session_id or "none", self.model, self.provider or "unknown", - self.platform or "unknown", len(conversation_history or []), - _msg_preview, - ) - - # Initialize conversation (copy to avoid mutating the caller's list) - messages = list(conversation_history) if conversation_history else [] - - # Hydrate todo store from conversation history (gateway creates a fresh - # AIAgent per message, so the in-memory store is empty -- we need to - # recover the todo state from the most recent todo tool response in history) - if conversation_history and not self._todo_store.has_items(): - self._hydrate_todo_store(conversation_history) - - # Hydrate per-session nudge counters from persisted history. 
- # Gateway creates a fresh AIAgent per inbound message (cache miss / - # 1h idle eviction / config-signature mismatch / process restart), so - # _turns_since_memory and _user_turn_count start at 0 every turn and - # the memory.nudge_interval trigger may never be reached. Reconstruct - # an effective count from prior user turns in conversation_history. - # Idempotent: a cached agent that already accumulated counters keeps - # them; only a freshly-built agent with empty in-memory state hydrates. - # See issue #22357. - if conversation_history and self._user_turn_count == 0: - prior_user_turns = sum( - 1 for m in conversation_history if m.get("role") == "user" - ) - if prior_user_turns > 0: - self._user_turn_count = prior_user_turns - if self._memory_nudge_interval > 0 and self._turns_since_memory == 0: - # % preserves original 1-in-N cadence rather than firing a - # review immediately on resume (which would surprise users - # whose session happened to land just past a multiple of N). - self._turns_since_memory = prior_user_turns % self._memory_nudge_interval - - - # Prefill messages (few-shot priming) are injected at API-call time only, - # never stored in the messages list. This keeps them ephemeral: they won't - # be saved to session DB, session logs, or batch trajectories, but they're - # automatically re-applied on every API call (including session continuations). - - # Track user turns for memory flush and periodic nudge logic - self._user_turn_count += 1 - - # Reset the streaming context scrubber at the top of each turn so a - # hung span from a prior interrupted stream can't taint this turn's - # output. - scrubber = getattr(self, "_stream_context_scrubber", None) - if scrubber is not None: - scrubber.reset() - # Reset the think scrubber for the same reason — an interrupted - # prior stream may have left us inside an unterminated block. 
- think_scrubber = getattr(self, "_stream_think_scrubber", None) - if think_scrubber is not None: - think_scrubber.reset() - - # Preserve the original user message (no nudge injection). - original_user_message = persist_user_message if persist_user_message is not None else user_message - - # Track memory nudge trigger (turn-based, checked here). - # Skill trigger is checked AFTER the agent loop completes, based on - # how many tool iterations THIS turn used. - _should_review_memory = False - if (self._memory_nudge_interval > 0 - and "memory" in self.valid_tool_names - and self._memory_store): - self._turns_since_memory += 1 - if self._turns_since_memory >= self._memory_nudge_interval: - _should_review_memory = True - self._turns_since_memory = 0 - - # Add user message - user_msg = {"role": "user", "content": user_message} - messages.append(user_msg) - current_turn_user_idx = len(messages) - 1 - self._persist_user_message_idx = current_turn_user_idx - - if not self.quiet_mode: - _print_preview = _summarize_user_message_for_log(user_message) - self._safe_print(f"💬 Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'") - - # ── System prompt (cached per session for prefix caching) ── - # Built once on first call, reused for all subsequent calls. - # Only rebuilt after context compression events (which invalidate - # the cache and reload memory from disk). - # - # For continuing sessions (gateway creates a fresh AIAgent per - # message), we load the stored system prompt from the session DB - # instead of rebuilding. Rebuilding would pick up memory changes - # from disk that the model already knows about (it wrote them!), - # producing a different system prompt and breaking the Anthropic - # prefix cache. 
- if self._cached_system_prompt is None: - stored_prompt = None - if conversation_history and self._session_db: - try: - session_row = self._session_db.get_session(self.session_id) - if session_row: - stored_prompt = session_row.get("system_prompt") or None - except Exception: - pass # Fall through to build fresh - - if stored_prompt: - # Continuing session — reuse the exact system prompt from - # the previous turn so the Anthropic cache prefix matches. - self._cached_system_prompt = stored_prompt - else: - # First turn of a new session — build from scratch. - self._cached_system_prompt = self._build_system_prompt(system_message) - # Plugin hook: on_session_start - # Fired once when a brand-new session is created (not on - # continuation). Plugins can use this to initialise - # session-scoped state (e.g. warm a memory cache). - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "on_session_start", - session_id=self.session_id, - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("on_session_start hook failed: %s", exc) - - # Store the system prompt snapshot in SQLite - if self._session_db: - try: - self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt) - except Exception as e: - logger.debug("Session DB update_system_prompt failed: %s", e) - - active_system_prompt = self._cached_system_prompt - - # ── Preflight context compression ── - # Before entering the main loop, check if the loaded conversation - # history already exceeds the model's context threshold. This handles - # cases where a user switches to a model with a smaller context window - # while having a large existing session — compress proactively rather - # than waiting for an API error (which might be caught as a non-retryable - # 4xx and abort the request entirely). 
- if ( - self.compression_enabled - and len(messages) > self.context_compressor.protect_first_n - + self.context_compressor.protect_last_n + 1 - ): - # Include tool schema tokens — with many tools these can add - # 20-30K+ tokens that the old sys+msg estimate missed entirely. - _preflight_tokens = estimate_request_tokens_rough( - messages, - system_prompt=active_system_prompt or "", - tools=self.tools or None, - ) - - if _preflight_tokens >= self.context_compressor.threshold_tokens: - logger.info( - "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", - f"{_preflight_tokens:,}", - f"{self.context_compressor.threshold_tokens:,}", - self.model, - f"{self.context_compressor.context_length:,}", - ) - self._emit_status( - f"📦 Preflight compression: ~{_preflight_tokens:,} tokens " - f">= {self.context_compressor.threshold_tokens:,} threshold. " - "This may take a moment." - ) - # May need multiple passes for very large sessions with small - # context windows (each pass summarises the middle N turns). - for _pass in range(3): - _orig_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=_preflight_tokens, - task_id=effective_task_id, - ) - if len(messages) >= _orig_len: - break # Cannot compress further - # Compression created a new session — clear the history - # reference so _flush_messages_to_session_db writes ALL - # compressed messages to the new session's SQLite, not - # skipping them because conversation_history is still the - # pre-compression length. - conversation_history = None - # Fix: reset retry counters after compression so the model - # gets a fresh budget on the compressed context. Without - # this, pre-compression retries carry over and the model - # hits "(empty)" immediately after compression-induced - # context loss. 
- self._empty_content_retries = 0 - self._thinking_prefill_retries = 0 - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._mute_post_response = False - # Re-estimate after compression - _preflight_tokens = estimate_request_tokens_rough( - messages, - system_prompt=active_system_prompt or "", - tools=self.tools or None, - ) - if _preflight_tokens < self.context_compressor.threshold_tokens: - break # Under threshold - - # Plugin hook: pre_llm_call - # Fired once per turn before the tool-calling loop. Plugins can - # return a dict with a ``context`` key (or a plain string) whose - # value is appended to the current turn's user message. - # - # Context is ALWAYS injected into the user message, never the - # system prompt. This preserves the prompt cache prefix — the - # system prompt stays identical across turns so cached tokens - # are reused. The system prompt is Hermes's territory; plugins - # contribute context alongside the user's input. - # - # All injected context is ephemeral (not persisted to session DB). 
- _plugin_user_context = "" - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _pre_results = _invoke_hook( - "pre_llm_call", - session_id=self.session_id, - user_message=original_user_message, - conversation_history=list(messages), - is_first_turn=(not bool(conversation_history)), - model=self.model, - platform=getattr(self, "platform", None) or "", - sender_id=getattr(self, "_user_id", None) or "", - ) - _ctx_parts: list[str] = [] - for r in _pre_results: - if isinstance(r, dict) and r.get("context"): - _ctx_parts.append(str(r["context"])) - elif isinstance(r, str) and r.strip(): - _ctx_parts.append(r) - if _ctx_parts: - _plugin_user_context = "\n\n".join(_ctx_parts) - except Exception as exc: - logger.warning("pre_llm_call hook failed: %s", exc) - - # Main conversation loop - api_call_count = 0 - final_response = None - interrupted = False - codex_ack_continuations = 0 - length_continue_retries = 0 - truncated_tool_call_retries = 0 - truncated_response_parts: List[str] = [] - compression_attempts = 0 - _turn_exit_reason = "unknown" # Diagnostic: why the loop ended - - # Per-turn file-mutation verifier state. Keyed by resolved path; - # each failed ``write_file`` / ``patch`` call records the error - # preview. Later successful writes to the same path remove the - # entry (the model recovered). At end-of-turn, any entries still - # present are surfaced in an advisory footer so the model cannot - # over-claim success while the file is actually unchanged on disk. - self._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {} - - # Record the execution thread so interrupt()/clear_interrupt() can - # scope the tool-level interrupt signal to THIS agent's thread only. - # Must be set before any thread-scoped interrupt syncing. - self._execution_thread_id = threading.current_thread().ident - - # Always clear stale per-thread state from a previous turn. 
If an - # interrupt arrived before startup finished, preserve it and bind it - # to this execution thread now instead of dropping it on the floor. - _set_interrupt(False, self._execution_thread_id) - if self._interrupt_requested: - _set_interrupt(True, self._execution_thread_id) - self._interrupt_thread_signal_pending = False - else: - self._interrupt_message = None - self._interrupt_thread_signal_pending = False - - # Notify memory providers of the new turn so cadence tracking works. - # Must happen BEFORE prefetch_all() so providers know which turn it is - # and can gate context/dialectic refresh via contextCadence/dialecticCadence. - if self._memory_manager: - try: - _turn_msg = original_user_message if isinstance(original_user_message, str) else "" - self._memory_manager.on_turn_start(self._user_turn_count, _turn_msg) - except Exception: - pass - - # External memory provider: prefetch once before the tool loop. - # Reuse the cached result on every iteration to avoid re-calling - # prefetch_all() on each tool call (10 tool calls = 10x latency + cost). - # Use original_user_message (clean input) — user_message may contain - # injected skill content that bloats / breaks provider queries. - _ext_prefetch_cache = "" - if self._memory_manager: - try: - _query = original_user_message if isinstance(original_user_message, str) else "" - _ext_prefetch_cache = self._memory_manager.prefetch_all(_query) or "" - except Exception: - pass - - # Optional opt-in runtime: if api_mode == codex_app_server, hand the - # turn to the codex app-server subprocess (terminal/file ops/patching - # all run inside Codex). Default Hermes path is bypassed entirely. - # See agent/transports/codex_app_server_session.py for the adapter - # and references/codex-app-server-runtime.md for the rationale. 
- if self.api_mode == "codex_app_server": - return self._run_codex_app_server_turn( - user_message=user_message, - original_user_message=original_user_message, - messages=messages, - effective_task_id=effective_task_id, - should_review_memory=_should_review_memory, - ) - - while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call: - # Reset per-turn checkpoint dedup so each iteration can take one snapshot - self._checkpoint_mgr.new_turn() - - # Check for interrupt request (e.g., user sent new message) - if self._interrupt_requested: - interrupted = True - _turn_exit_reason = "interrupted_by_user" - if not self.quiet_mode: - self._safe_print("\n⚡ Breaking out of tool loop due to interrupt...") - break - - api_call_count += 1 - self._api_call_count = api_call_count - self._touch_activity(f"starting API call #{api_call_count}") - - # Grace call: the budget is exhausted but we gave the model one - # more chance. Consume the grace flag so the loop exits after - # this iteration regardless of outcome. 
- if self._budget_grace_call: - self._budget_grace_call = False - elif not self.iteration_budget.consume(): - _turn_exit_reason = "budget_exhausted" - if not self.quiet_mode: - self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)") - break - - # Fire step_callback for gateway hooks (agent:step event) - if self.step_callback is not None: - try: - prev_tools = [] - for _idx, _m in enumerate(reversed(messages)): - if _m.get("role") == "assistant" and _m.get("tool_calls"): - _fwd_start = len(messages) - _idx - _results_by_id = {} - for _tm in messages[_fwd_start:]: - if _tm.get("role") != "tool": - break - _tcid = _tm.get("tool_call_id") - if _tcid: - _results_by_id[_tcid] = _tm.get("content", "") - prev_tools = [ - { - "name": tc["function"]["name"], - "result": _results_by_id.get(tc.get("id")), - "arguments": tc["function"].get("arguments"), - } - for tc in _m["tool_calls"] - if isinstance(tc, dict) - ] - break - self.step_callback(api_call_count, prev_tools) - except Exception as _step_err: - logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err) - - # Track tool-calling iterations for skill nudge. - # Counter resets whenever skill_manage is actually used. - if (self._skill_nudge_interval > 0 - and "skill_manage" in self.valid_tool_names): - self._iters_since_skill += 1 - - # ── Pre-API-call /steer drain ────────────────────────────────── - # If a /steer arrived during the previous API call (while the model - # was thinking), drain it now — before we build api_messages — so - # the model sees the steer text on THIS iteration. Without this, - # steers sent during an API call only land after the NEXT tool batch, - # which may never come if the model returns a final response. - # - # We scan backwards for the last tool-role message in the messages - # list. If found, the steer is appended there. 
If not (first - # iteration, no tools yet), the steer stays pending for the next - # tool batch — injecting into a user message would break role - # alternation, and there's no tool output to piggyback on. - _pre_api_steer = self._drain_pending_steer() - if _pre_api_steer: - _injected = False - for _si in range(len(messages) - 1, -1, -1): - _sm = messages[_si] - if isinstance(_sm, dict) and _sm.get("role") == "tool": - marker = f"\n\nUser guidance: {_pre_api_steer}" - existing = _sm.get("content", "") - if isinstance(existing, str): - _sm["content"] = existing + marker - else: - # Multimodal content blocks — append text block - try: - blocks = list(existing) if existing else [] - blocks.append({"type": "text", "text": marker}) - _sm["content"] = blocks - except Exception: - pass - _injected = True - logger.debug( - "Pre-API-call steer drain: injected into tool msg at index %d", - _si, - ) - break - if not _injected: - # No tool message to inject into — put it back so - # the post-tool-execution drain picks it up later. - _lock = getattr(self, "_pending_steer_lock", None) - if _lock is not None: - with _lock: - if self._pending_steer: - self._pending_steer = self._pending_steer + "\n" + _pre_api_steer - else: - self._pending_steer = _pre_api_steer - else: - existing = getattr(self, "_pending_steer", None) - self._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer - - # Prepare messages for API call - # If we have an ephemeral system prompt, prepend it to the messages - # Note: Reasoning is embedded in content via tags for trajectory storage. - # However, providers like Moonshot AI require a separate 'reasoning_content' field - # on assistant messages with tool_calls. We handle both cases here. 
- request_logger = getattr(self, "logger", None) or logging.getLogger(__name__) - repaired_tool_calls = self._sanitize_tool_call_arguments( - messages, - logger=request_logger, - session_id=self.session_id, - ) - if repaired_tool_calls > 0: - request_logger.info( - "Sanitized %s corrupted tool_call arguments before request (session=%s)", - repaired_tool_calls, - self.session_id or "-", - ) - - # Defensive: repair malformed role-alternation before API call. - # Catches cases where the history got wedged into a - # ``tool → user`` or ``user → user`` tail (e.g. after empty- - # response scaffolding was stripped and a new user message - # landed after an orphan tool result). Most providers return - # empty content on malformed sequences, which would otherwise - # retrigger the empty-retry loop indefinitely. - repaired_seq = self._repair_message_sequence(messages) - if repaired_seq > 0: - request_logger.info( - "Repaired %s message-alternation violations before request (session=%s)", - repaired_seq, - self.session_id or "-", - ) - - api_messages = [] - for idx, msg in enumerate(messages): - api_msg = msg.copy() - - # Inject ephemeral context into the current turn's user message. - # Sources: memory manager prefetch + plugin pre_llm_call hooks - # with target="user_message" (the default). Both are - # API-call-time only — the original message in `messages` is - # never mutated, so nothing leaks into session persistence. 
- if idx == current_turn_user_idx and msg.get("role") == "user": - _injections = [] - if _ext_prefetch_cache: - _fenced = build_memory_context_block(_ext_prefetch_cache) - if _fenced: - _injections.append(_fenced) - if _plugin_user_context: - _injections.append(_plugin_user_context) - if _injections: - _base = api_msg.get("content", "") - if isinstance(_base, str): - api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections) - - # For ALL assistant messages, pass reasoning back to the API - # This ensures multi-turn reasoning context is preserved - self._copy_reasoning_content_for_api(msg, api_msg) - - # Remove 'reasoning' field - it's for trajectory storage only - # We've copied it to 'reasoning_content' for the API above - if "reasoning" in api_msg: - api_msg.pop("reasoning") - # Remove finish_reason - not accepted by strict APIs (e.g. Mistral) - if "finish_reason" in api_msg: - api_msg.pop("finish_reason") - # Strip internal thinking-prefill marker - api_msg.pop("_thinking_prefill", None) - # Strip Codex Responses API fields (call_id, response_item_id) for - # strict providers like Mistral, Fireworks, etc. that reject unknown fields. - # Uses new dicts so the internal messages list retains the fields - # for Codex Responses compatibility. - if self._should_sanitize_tool_calls(): - self._sanitize_tool_calls_for_strict_api(api_msg) - # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context - # The signature field helps maintain reasoning continuity - api_messages.append(api_msg) - - # Build the final system message: cached prompt + ephemeral system prompt. - # Ephemeral additions are API-call-time only (not persisted to session DB). - # External recall context is injected into the user message, not the system - # prompt, so the stable cache prefix remains unchanged. - # - # NOTE: Plugin context from pre_llm_call hooks is injected into the - # user message (see injection block above), NOT the system prompt. 
- # This is intentional — system prompt modifications break the prompt - # cache prefix. The system prompt is reserved for Hermes internals. - # - # Hermes invariant: the system prompt is built ONCE per session - # (cached on ``_cached_system_prompt``) and replayed verbatim on - # every turn. We send it as a single content string so the - # bytes are byte-stable across turns and upstream prompt caches - # stay warm. - effective_system = active_system_prompt or "" - if self.ephemeral_system_prompt: - effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() - if effective_system: - api_messages = [{"role": "system", "content": effective_system}] + api_messages - - # Inject ephemeral prefill messages right after the system prompt - # but before conversation history. Same API-call-time-only pattern. - if self.prefill_messages: - sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0 - for idx, pfm in enumerate(self.prefill_messages): - api_messages.insert(sys_offset + idx, pfm.copy()) - - # Apply Anthropic prompt caching for Claude models on native - # Anthropic, OpenRouter, and third-party Anthropic-compatible - # gateways. Auto-detected: if ``_use_prompt_caching`` is set, - # inject cache_control breakpoints (system + last 3 messages) - # to reduce input token costs by ~75% on multi-turn - # conversations. - if self._use_prompt_caching: - api_messages = apply_anthropic_cache_control( - api_messages, - cache_ttl=self._cache_ttl, - native_anthropic=self._use_native_cache_layout, - ) - - # Safety net: strip orphaned tool results / add stubs for missing - # results before sending to the API. Runs unconditionally — not - # gated on context_compressor — so orphans from session loading or - # manual message manipulation are always caught. 
- api_messages = self._sanitize_api_messages(api_messages) - - # Drop thinking-only assistant turns (reasoning but no visible - # output and no tool_calls) and merge any adjacent user messages - # left behind. Prevents Anthropic 400s ("The final block in an - # assistant message cannot be `thinking`.") and equivalent errors - # from third-party Anthropic-compatible gateways that can't replay - # a thinking-only turn. Runs on the per-call copy only — the - # stored conversation history keeps the reasoning block for the - # UI transcript and session persistence. - api_messages = self._drop_thinking_only_and_merge_users(api_messages) - - # Normalize message whitespace and tool-call JSON for consistent - # prefix matching. Ensures bit-perfect prefixes across turns, - # which enables KV cache reuse on local inference servers - # (llama.cpp, vLLM, Ollama) and improves cache hit rates for - # cloud providers. Operates on api_messages (the API copy) so - # the original conversation history in `messages` is untouched. - for am in api_messages: - if isinstance(am.get("content"), str): - am["content"] = am["content"].strip() - for am in api_messages: - tcs = am.get("tool_calls") - if not tcs: - continue - new_tcs = [] - for tc in tcs: - if isinstance(tc, dict) and "function" in tc: - try: - args_obj = json.loads(tc["function"]["arguments"]) - tc = {**tc, "function": { - **tc["function"], - "arguments": json.dumps( - args_obj, separators=(",", ":"), - sort_keys=True, - ), - }} - except Exception: - tc["function"]["arguments"] = _repair_tool_call_arguments( - tc["function"]["arguments"], - tc["function"].get("name", "?"), - ) - new_tcs.append(tc) - am["tool_calls"] = new_tcs - - # Proactively strip any surrogate characters before the API call. - # Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return - # lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside - # the OpenAI SDK. Sanitizing here prevents the 3-retry cycle. 
- _sanitize_messages_surrogates(api_messages) - - # Calculate approximate request size for logging - total_chars = sum(len(str(msg)) for msg in api_messages) - approx_tokens = estimate_messages_tokens_rough(api_messages) - - # Thinking spinner for quiet mode (animated during API call) - thinking_spinner = None - - if not self.quiet_mode: - self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...") - self._vprint(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") - self._vprint(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}") - else: - # Animated thinking spinner in quiet mode - face = random.choice(KawaiiSpinner.get_thinking_faces()) - verb = random.choice(KawaiiSpinner.get_thinking_verbs()) - if self.thinking_callback: - # CLI TUI mode: use prompt_toolkit widget instead of raw spinner - # (works in both streaming and non-streaming modes) - self.thinking_callback(f"{face} {verb}...") - elif not self._has_stream_consumers() and self._should_start_quiet_spinner(): - # Raw KawaiiSpinner only when no streaming consumers and the - # spinner output has a safe sink. 
- spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star']) - thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=self._print_fn) - thinking_spinner.start() - - # Log request details if verbose - if self.verbose_logging: - logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}") - logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") - logging.debug(f"Total message size: ~{approx_tokens:,} tokens") - - api_start_time = time.time() - retry_count = 0 - max_retries = self._api_max_retries - primary_recovery_attempted = False - max_compression_attempts = 3 - codex_auth_retry_attempted=False - anthropic_auth_retry_attempted=False - nous_auth_retry_attempted=False - copilot_auth_retry_attempted=False - thinking_sig_retry_attempted = False - image_shrink_retry_attempted = False - oauth_1m_beta_retry_attempted = False - llama_cpp_grammar_retry_attempted = False - has_retried_429 = False - restart_with_compressed_messages = False - restart_with_length_continuation = False - - finish_reason = "stop" - response = None # Guard against UnboundLocalError if all retries fail - api_kwargs = None # Guard against UnboundLocalError in except handler - - while retry_count < max_retries: - # ── Nous Portal rate limit guard ────────────────────── - # If another session already recorded that Nous is rate- - # limited, skip the API call entirely. Each attempt - # (including SDK-level retries) counts against RPH and - # deepens the rate limit hole. - if self.provider == "nous": - try: - from agent.nous_rate_guard import ( - nous_rate_limit_remaining, - format_remaining as _fmt_nous_remaining, - ) - _nous_remaining = nous_rate_limit_remaining() - if _nous_remaining is not None and _nous_remaining > 0: - _nous_msg = ( - f"Nous Portal rate limit active — " - f"resets in {_fmt_nous_remaining(_nous_remaining)}." 
- ) - self._vprint( - f"{self.log_prefix}⏳ {_nous_msg} Trying fallback...", - force=True, - ) - self._emit_status(f"⏳ {_nous_msg}") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - # No fallback available — return with clear message - self._persist_session(messages, conversation_history) - return { - "final_response": ( - f"⏳ {_nous_msg}\n\n" - "No fallback provider available. " - "Try again after the reset, or add a " - "fallback provider in config.yaml." - ), - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": _nous_msg, - } - except ImportError: - pass - except Exception: - pass # Never let rate guard break the agent loop - - try: - self._reset_stream_delivery_tracking() - api_kwargs = self._build_api_kwargs(api_messages) - if self._force_ascii_payload: - _sanitize_structure_non_ascii(api_kwargs) - if self.api_mode == "codex_responses": - api_kwargs = self._get_transport().preflight_kwargs(api_kwargs, allow_stream=False) - - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - request_messages = api_kwargs.get("messages") - if not isinstance(request_messages, list): - request_messages = api_kwargs.get("input") - if not isinstance(request_messages, list): - request_messages = api_messages - # Shallow-copy the outer list so plugins that retain the - # reference for async snapshotting don't observe later - # mutations of api_messages. The inner dicts are not - # mutated by the agent loop, so a shallow copy is - # sufficient; a deepcopy would walk every tool result - # and base64 image on every API call. 
- _invoke_hook( - "pre_api_request", - task_id=effective_task_id, - session_id=self.session_id or "", - user_message=original_user_message, - conversation_history=list(messages), - platform=self.platform or "", - model=self.model, - provider=self.provider, - base_url=self.base_url, - api_mode=self.api_mode, - api_call_count=api_call_count, - request_messages=list(request_messages) if isinstance(request_messages, list) else [], - message_count=len(api_messages), - tool_count=len(self.tools or []), - approx_input_tokens=approx_tokens, - request_char_count=total_chars, - max_tokens=self.max_tokens, - ) - except Exception: - pass - - if env_var_enabled("HERMES_DUMP_REQUESTS"): - self._dump_api_request_debug(api_kwargs, reason="preflight") - - # Always prefer the streaming path — even without stream - # consumers. Streaming gives us fine-grained health - # checking (90s stale-stream detection, 60s read timeout) - # that the non-streaming path lacks. Without this, - # subagents and other quiet-mode callers can hang - # indefinitely when the provider keeps the connection - # alive with SSE pings but never delivers a response. - # The streaming path is a no-op for callbacks when no - # consumers are registered, and falls back to non- - # streaming automatically if the provider doesn't - # support it. - def _stop_spinner(): - nonlocal thinking_spinner - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - _use_streaming = True - # Provider signaled "stream not supported" on a previous - # attempt — switch to non-streaming for the rest of this - # session instead of re-failing every retry. - if getattr(self, "_disable_streaming", False): - _use_streaming = False - # CopilotACPClient communicates via subprocess stdio and - # returns a plain SimpleNamespace — not an iterable - # stream. Mirror the ACP exclusion used for Responses - # API upgrade (lines ~1083-1085). 
- elif ( - self.provider == "copilot-acp" - or str(self.base_url or "").lower().startswith("acp://copilot") - or str(self.base_url or "").lower().startswith("acp+tcp://") - ): - _use_streaming = False - elif not self._has_stream_consumers(): - # No display/TTS consumer. Still prefer streaming for - # health checking, but skip for Mock clients in tests - # (mocks return SimpleNamespace, not stream iterators). - from unittest.mock import Mock - if isinstance(getattr(self, "client", None), Mock): - _use_streaming = False - - if _use_streaming: - response = self._interruptible_streaming_api_call( - api_kwargs, on_first_delta=_stop_spinner - ) - else: - response = self._interruptible_api_call(api_kwargs) - - api_duration = time.time() - api_start_time - - # Stop thinking spinner silently -- the response box or tool - # execution messages that follow are more informative. - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s") - - if self.verbose_logging: - # Log response with provider info if available - resp_model = getattr(response, 'model', 'N/A') if response else 'N/A' - logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") - - # Validate response shape before proceeding - response_invalid = False - error_details = [] - if self.api_mode == "codex_responses": - _ct_v = self._get_transport() - if not _ct_v.validate_response(response): - if response is None: - response_invalid = True - error_details.append("response is None") - else: - # Provider returned a terminal failure (e.g. quota exhaustion). - # Treat as invalid so the fallback chain is triggered instead of - # letting the error bubble up outside the retry/fallback loop. 
- _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower() - if _codex_resp_status in {"failed", "cancelled"}: - _codex_error_obj = getattr(response, "error", None) - _codex_error_msg = ( - _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict) - else str(_codex_error_obj) if _codex_error_obj - else f"Responses API returned status '{_codex_resp_status}'" - ) - logging.warning( - "Codex response status='%s' (error=%s). Routing to fallback. %s", - _codex_resp_status, _codex_error_msg, - self._client_log_context(), - ) - response_invalid = True - error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}") - else: - # output_text fallback: stream backfill may have failed - # but normalize can still recover from output_text - _out_text = getattr(response, "output_text", None) - _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" - if _out_text_stripped: - logger.debug( - "Codex response.output is empty but output_text is present " - "(%d chars); deferring to normalization.", - len(_out_text_stripped), - ) - else: - _resp_status = getattr(response, "status", None) - _resp_incomplete = getattr(response, "incomplete_details", None) - logger.warning( - "Codex response.output is empty after stream backfill " - "(status=%s, incomplete_details=%s, model=%s). 
%s", - _resp_status, _resp_incomplete, - getattr(response, "model", None), - f"api_mode={self.api_mode} provider={self.provider}", - ) - response_invalid = True - error_details.append("response.output is empty") - elif self.api_mode == "anthropic_messages": - _tv = self._get_transport() - if not _tv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - else: - error_details.append("response.content invalid (not a non-empty list)") - elif self.api_mode == "bedrock_converse": - _btv = self._get_transport() - if not _btv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - else: - error_details.append("Bedrock response invalid (no output or choices)") - else: - _ctv = self._get_transport() - if not _ctv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - elif not hasattr(response, 'choices'): - error_details.append("response has no 'choices' attribute") - elif response.choices is None: - error_details.append("response.choices is None") - else: - error_details.append("response.choices is empty") - - if response_invalid: - # Stop spinner before printing error messages - if thinking_spinner: - thinking_spinner.stop("(´;ω;`) oops, retrying...") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - # Invalid response — could be rate limiting, provider timeout, - # upstream server error, or malformed response. - retry_count += 1 - - # Eager fallback: empty/malformed responses are a common - # rate-limit symptom. Switch to fallback immediately - # rather than retrying with extended backoff. 
- if self._fallback_index < len(self._fallback_chain): - self._emit_status("⚠️ Empty/malformed response — switching to fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - - # Check for error field in response (some providers include this) - error_msg = "Unknown" - provider_name = "Unknown" - if response and hasattr(response, 'error') and response.error: - error_msg = str(response.error) - # Try to extract provider from error metadata - if hasattr(response.error, 'metadata') and response.error.metadata: - provider_name = response.error.metadata.get('provider_name', 'Unknown') - elif response and hasattr(response, 'message') and response.message: - error_msg = str(response.message) - - # Try to get provider from model field (OpenRouter often returns actual model used) - if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model: - provider_name = f"model={response.model}" - - # Check for x-openrouter-provider or similar metadata - if provider_name == "Unknown" and response: - # Log all response attributes for debugging - resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')} - if self.verbose_logging: - logging.debug(f"Response attributes for invalid response: {resp_attrs}") - - # Extract error code from response for contextual diagnostics - _resp_error_code = None - if response and hasattr(response, 'error') and response.error: - _code_raw = getattr(response.error, 'code', None) - if _code_raw is None and isinstance(response.error, dict): - _code_raw = response.error.get('code') - if _code_raw is not None: - try: - _resp_error_code = int(_code_raw) - except (TypeError, ValueError): - pass - - # Build a human-readable failure hint from the error code - # and response time, instead of always assuming rate limiting. 
- if _resp_error_code == 524: - _failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)" - elif _resp_error_code == 504: - _failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)" - elif _resp_error_code == 429: - _failure_hint = f"rate limited by upstream provider (429)" - elif _resp_error_code in {500, 502}: - _failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)" - elif _resp_error_code in {503, 529}: - _failure_hint = f"upstream provider overloaded ({_resp_error_code})" - elif _resp_error_code is not None: - _failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)" - elif api_duration < 10: - _failure_hint = f"fast response ({api_duration:.1f}s) — likely rate limited" - elif api_duration > 60: - _failure_hint = f"slow response ({api_duration:.0f}s) — likely upstream timeout" - else: - _failure_hint = f"response time {api_duration:.1f}s" - - self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True) - self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}", force=True) - cleaned_provider_error = self._clean_error_message(error_msg) - self._vprint(f"{self.log_prefix} 📝 Provider message: {cleaned_provider_error}", force=True) - self._vprint(f"{self.log_prefix} ⏱️ {_failure_hint}", force=True) - - if retry_count >= max_retries: - # Try fallback before giving up - self._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - self._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. 
Giving up.") - logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Invalid API response after {max_retries} retries: {_failure_hint}", - "failed": True # Mark as failure for filtering - } - - # Backoff before retry — jittered exponential: 5s base, 120s cap - wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0) - self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True) - logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") - - # Sleep in small increments to stay responsive to interrupts - sleep_end = time.time() + wait_time - _backoff_touch_counter = 0 - while time.time() < sleep_end: - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - time.sleep(0.2) - # Touch activity every ~30s so the gateway's inactivity - # monitor knows we're alive during backoff waits. 
- _backoff_touch_counter += 1 - if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s - self._touch_activity( - f"retry backoff ({retry_count}/{max_retries}), " - f"{int(sleep_end - time.time())}s remaining" - ) - continue # Retry the API call - - # Check finish_reason before proceeding - if self.api_mode == "codex_responses": - status = getattr(response, "status", None) - incomplete_details = getattr(response, "incomplete_details", None) - incomplete_reason = None - if isinstance(incomplete_details, dict): - incomplete_reason = incomplete_details.get("reason") - else: - incomplete_reason = getattr(incomplete_details, "reason", None) - if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}: - finish_reason = "length" - else: - finish_reason = "stop" - elif self.api_mode == "anthropic_messages": - _tfr = self._get_transport() - finish_reason = _tfr.map_finish_reason(response.stop_reason) - elif self.api_mode == "bedrock_converse": - # Bedrock response already normalized at dispatch — use transport - _bt_fr = self._get_transport() - _bedrock_result = _bt_fr.normalize_response(response) - finish_reason = _bedrock_result.finish_reason - else: - _cc_fr = self._get_transport() - _finish_result = _cc_fr.normalize_response(response) - finish_reason = _finish_result.finish_reason - assistant_message = _finish_result - if self._should_treat_stop_as_truncated( - finish_reason, - assistant_message, - messages, - ): - self._vprint( - f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated", - force=True, - ) - finish_reason = "length" - - if finish_reason == "length": - self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) - - # Normalize the truncated response to a single OpenAI-style - # message shape so text-continuation and tool-call retry - # work uniformly across chat_completions, bedrock_converse, - # and anthropic_messages. 
For Anthropic we use the same - # adapter the agent loop already relies on so the rebuilt - # interim assistant message is byte-identical to what - # would have been appended in the non-truncated path. - _trunc_msg = None - _trunc_transport = self._get_transport() - if self.api_mode == "anthropic_messages": - _trunc_result = _trunc_transport.normalize_response( - response, strip_tool_prefix=self._is_anthropic_oauth - ) - else: - _trunc_result = _trunc_transport.normalize_response(response) - _trunc_msg = _trunc_result - - _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None - _trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False - - # ── Detect thinking-budget exhaustion ────────────── - # When the model spends ALL output tokens on reasoning - # and has none left for the response, continuation - # retries are pointless. Detect this early and give a - # targeted error instead of wasting 3 API calls. - # A response is "thinking exhausted" only when the model - # actually produced reasoning blocks but no visible text after - # them. Models that do not use tags (e.g. GLM-4.7 on - # NVIDIA Build, minimax) may return content=None or an empty - # string for unrelated reasons — treat those as normal - # truncations that deserve continuation retries, not as - # thinking-budget exhaustion. - _has_think_tags = bool( - _trunc_content and re.search( - r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>', - _trunc_content, - re.IGNORECASE, - ) - ) - _thinking_exhausted = ( - not _trunc_has_tool_calls - and _has_think_tags - and ( - (_trunc_content is not None and not self._has_content_after_think_block(_trunc_content)) - or _trunc_content is None - ) - ) - - if _thinking_exhausted: - _exhaust_error = ( - "Model used all output tokens on reasoning with none left " - "for the response. Try lowering reasoning effort or " - "increasing max_tokens." 
- ) - self._vprint( - f"{self.log_prefix}💭 Reasoning exhausted the output token budget — " - f"no visible response was produced.", - force=True, - ) - # Return a user-friendly message as the response so - # CLI (response box) and gateway (chat message) both - # display it naturally instead of a suppressed error. - _exhaust_response = ( - "⚠️ **Thinking Budget Exhausted**\n\n" - "The model used all its output tokens on reasoning " - "and had none left for the actual response.\n\n" - "To fix this:\n" - "→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n" - "→ Or switch to a larger/non-reasoning model with `/model`" - ) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": _exhaust_response, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": _exhaust_error, - } - - if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: - assistant_message = _trunc_msg - if assistant_message is not None and not _trunc_has_tool_calls: - length_continue_retries += 1 - interim_msg = self._build_assistant_message(assistant_message, finish_reason) - messages.append(interim_msg) - if assistant_message.content: - truncated_response_parts.append(assistant_message.content) - - if length_continue_retries < 3: - self._vprint( - f"{self.log_prefix}↻ Requesting continuation " - f"({length_continue_retries}/3)..." - ) - continue_msg = { - "role": "user", - "content": ( - "[System: Your previous response was truncated by the output " - "length limit. Continue exactly where you left off. Do not " - "restart or repeat prior text. 
Finish the answer directly.]" - ), - } - messages.append(continue_msg) - self._session_messages = messages - self._save_session_log(messages) - restart_with_length_continuation = True - break - - partial_response = self._strip_think_blocks("".join(truncated_response_parts)).strip() - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": partial_response or None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response remained truncated after 3 continuation attempts", - } - - if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: - assistant_message = _trunc_msg - if assistant_message is not None and _trunc_has_tool_calls: - if truncated_tool_call_retries < 1: - truncated_tool_call_retries += 1 - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call detected — retrying API call...", - force=True, - ) - # Don't append the broken response to messages; - # just re-run the same API call from the current - # message state, giving the model another chance. 
- continue - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.", - force=True, - ) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit", - } - - # If we have prior messages, roll back to last complete state - if len(messages) > 1: - self._vprint(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn") - rolled_back_messages = self._get_messages_up_to_last_assistant(messages) - - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - - return { - "final_response": None, - "messages": rolled_back_messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit" - } - else: - # First message was truncated - mark as failed - self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": "First response truncated due to output length limit" - } - - # Track actual token usage from response for context management - if hasattr(response, 'usage') and response.usage: - canonical_usage = normalize_usage( - response.usage, - provider=self.provider, - api_mode=self.api_mode, - ) - prompt_tokens = canonical_usage.prompt_tokens - completion_tokens = canonical_usage.output_tokens - total_tokens = canonical_usage.total_tokens - usage_dict = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - } - 
self.context_compressor.update_from_response(usage_dict) - - # Cache discovered context length after successful call. - # Only persist limits confirmed by the provider (parsed - # from the error message), not guessed probe tiers. - if getattr(self.context_compressor, "_context_probed", False): - ctx = self.context_compressor.context_length - if getattr(self.context_compressor, "_context_probe_persistable", False): - save_context_length(self.model, self.base_url, ctx) - self._safe_print(f"{self.log_prefix}💾 Cached context length: {ctx:,} tokens for {self.model}") - self.context_compressor._context_probed = False - self.context_compressor._context_probe_persistable = False - - self.session_prompt_tokens += prompt_tokens - self.session_completion_tokens += completion_tokens - self.session_total_tokens += total_tokens - self.session_api_calls += 1 - self.session_input_tokens += canonical_usage.input_tokens - self.session_output_tokens += canonical_usage.output_tokens - self.session_cache_read_tokens += canonical_usage.cache_read_tokens - self.session_cache_write_tokens += canonical_usage.cache_write_tokens - self.session_reasoning_tokens += canonical_usage.reasoning_tokens - - # Log API call details for debugging/observability - _cache_pct = "" - if canonical_usage.cache_read_tokens and prompt_tokens: - _cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)" - logger.info( - "API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s", - self.session_api_calls, self.model, self.provider or "unknown", - prompt_tokens, completion_tokens, total_tokens, - api_duration, _cache_pct, - ) - - cost_result = estimate_usage_cost( - self.model, - canonical_usage, - provider=self.provider, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - ) - if cost_result.amount_usd is not None: - self.session_estimated_cost_usd += float(cost_result.amount_usd) - 
self.session_cost_status = cost_result.status - self.session_cost_source = cost_result.source - - # Persist token counts to session DB for /insights. - # Do this for every platform with a session_id so non-CLI - # sessions (gateway, cron, delegated runs) cannot lose - # token/accounting data if a higher-level persistence path - # is skipped or fails. Gateway/session-store writes use - # absolute totals, so they safely overwrite these per-call - # deltas instead of double-counting them. - if self._session_db and self.session_id: - try: - # Ensure the session row exists before attempting UPDATE. - # Under concurrent load (cron/kanban), the initial - # _ensure_db_session() may have failed due to SQLite - # locking. Retry here so per-call token deltas are - # not silently lost (UPDATE on a non-existent row - # affects 0 rows without error). - if not self._session_db_created: - self._ensure_db_session() - self._session_db.update_token_counts( - self.session_id, - input_tokens=canonical_usage.input_tokens, - output_tokens=canonical_usage.output_tokens, - cache_read_tokens=canonical_usage.cache_read_tokens, - cache_write_tokens=canonical_usage.cache_write_tokens, - reasoning_tokens=canonical_usage.reasoning_tokens, - estimated_cost_usd=float(cost_result.amount_usd) - if cost_result.amount_usd is not None else None, - cost_status=cost_result.status, - cost_source=cost_result.source, - billing_provider=self.provider, - billing_base_url=self.base_url, - billing_mode="subscription_included" - if cost_result.status == "included" else None, - model=self.model, - api_call_count=1, - ) - except Exception as e: - # Log token persistence failures so they're - # visible in agent.log — silent loss here is - # the root cause of undercounted analytics. 
- logger.debug( - "Token persistence failed (session=%s, tokens=%d): %s", - self.session_id, total_tokens, e, - ) - - if self.verbose_logging: - logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}") - - # Surface cache hit stats for any provider that reports - # them — not just those where we inject cache_control - # markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic - # server-side prefix caching and return - # ``prompt_tokens_details.cached_tokens``; users - # previously could not see their cache % because this - # line was gated on ``_use_prompt_caching``, which is - # only True for Anthropic-style marker injection. - # ``canonical_usage`` is already normalised from all - # three API shapes (Anthropic / Codex / OpenAI-chat) - # so we can rely on its values directly. - cached = canonical_usage.cache_read_tokens - written = canonical_usage.cache_write_tokens - prompt = usage_dict["prompt_tokens"] - if (cached or written) and not self.quiet_mode: - hit_pct = (cached / prompt * 100) if prompt > 0 else 0 - self._vprint( - f"{self.log_prefix} 💾 Cache: " - f"{cached:,}/{prompt:,} tokens " - f"({hit_pct:.0f}% hit, {written:,} written)" - ) - - has_retried_429 = False # Reset on success - # Clear Nous rate limit state on successful request — - # proves the limit has reset and other sessions can - # resume hitting Nous. 
- if self.provider == "nous": - try: - from agent.nous_rate_guard import clear_nous_rate_limit - clear_nous_rate_limit() - except Exception: - pass - self._touch_activity(f"API call #{api_call_count} completed") - break # Success, exit retry loop - - except InterruptedError: - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - api_elapsed = time.time() - api_start_time - self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True) - self._persist_session(messages, conversation_history) - interrupted = True - final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." - break - - except Exception as api_error: - # Stop spinner before printing error messages - if thinking_spinner: - thinking_spinner.stop("(╥_╥) error, retrying...") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - # ----------------------------------------------------------- - # UnicodeEncodeError recovery. Two common causes: - # 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste - # (Google Docs, rich-text editors) — sanitize and retry. - # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale - # (e.g. Chromebooks) — any non-ASCII character fails. - # Detect via the error message mentioning 'ascii' codec. - # We sanitize messages in-place and may retry twice: - # first to strip surrogates, then once more for pure - # ASCII-only locale sanitization if needed. - # ----------------------------------------------------------- - if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2: - _err_str = str(api_error).lower() - _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str - # Detect surrogate errors — utf-8 codec refusing to - # encode U+D800..U+DFFF. 
The error text is: - # "'utf-8' codec can't encode characters in position - # N-M: surrogates not allowed" - _is_surrogate_error = ( - "surrogate" in _err_str - or ("'utf-8'" in _err_str and not _is_ascii_codec) - ) - # Sanitize surrogates from both the canonical `messages` - # list AND `api_messages` (the API-copy, which may carry - # `reasoning_content`/`reasoning_details` transformed - # from `reasoning` — fields the canonical list doesn't - # have directly). Also clean `api_kwargs` if built and - # `prefill_messages` if present. Mirrors the ASCII - # codec recovery below. - _surrogates_found = _sanitize_messages_surrogates(messages) - if isinstance(api_messages, list): - if _sanitize_messages_surrogates(api_messages): - _surrogates_found = True - if isinstance(api_kwargs, dict): - if _sanitize_structure_surrogates(api_kwargs): - _surrogates_found = True - if isinstance(getattr(self, "prefill_messages", None), list): - if _sanitize_messages_surrogates(self.prefill_messages): - _surrogates_found = True - # Gate the retry on the error type, not on whether we - # found anything — _force_ascii_payload / the extended - # surrogate walker above cover all known paths, but a - # new transformed field could still slip through. If - # the error was a surrogate encode failure, always let - # the retry run; the proactive sanitizer at line ~8781 - # runs again on the next iteration. Bounded by - # _unicode_sanitization_passes < 2 (outer guard). - if _surrogates_found or _is_surrogate_error: - self._unicode_sanitization_passes += 1 - if _surrogates_found: - self._vprint( - f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", - force=True, - ) - else: - self._vprint( - f"{self.log_prefix}⚠️ Surrogate encoding error — retrying after full-payload sanitization...", - force=True, - ) - continue - if _is_ascii_codec: - self._force_ascii_payload = True - # ASCII codec: the system encoding can't handle - # non-ASCII characters at all. 
Sanitize all - # non-ASCII content from messages/tool schemas and retry. - # Sanitize both the canonical `messages` list and - # `api_messages` (the API-copy built before the retry - # loop, which may contain extra fields like - # reasoning_content that are not in `messages`). - _messages_sanitized = _sanitize_messages_non_ascii(messages) - if isinstance(api_messages, list): - _sanitize_messages_non_ascii(api_messages) - # Also sanitize the last api_kwargs if already built, - # so a leftover non-ASCII value in a transformed field - # (e.g. extra_body, reasoning_content) doesn't survive - # into the next attempt via _build_api_kwargs cache paths. - if isinstance(api_kwargs, dict): - _sanitize_structure_non_ascii(api_kwargs) - _prefill_sanitized = False - if isinstance(getattr(self, "prefill_messages", None), list): - _prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages) - - _tools_sanitized = False - if isinstance(getattr(self, "tools", None), list): - _tools_sanitized = _sanitize_tools_non_ascii(self.tools) - - _system_sanitized = False - if isinstance(active_system_prompt, str): - _sanitized_system = _strip_non_ascii(active_system_prompt) - if _sanitized_system != active_system_prompt: - active_system_prompt = _sanitized_system - self._cached_system_prompt = _sanitized_system - _system_sanitized = True - if isinstance(getattr(self, "ephemeral_system_prompt", None), str): - _sanitized_ephemeral = _strip_non_ascii(self.ephemeral_system_prompt) - if _sanitized_ephemeral != self.ephemeral_system_prompt: - self.ephemeral_system_prompt = _sanitized_ephemeral - _system_sanitized = True - - _headers_sanitized = False - _default_headers = ( - self._client_kwargs.get("default_headers") - if isinstance(getattr(self, "_client_kwargs", None), dict) - else None - ) - if isinstance(_default_headers, dict): - _headers_sanitized = _sanitize_structure_non_ascii(_default_headers) - - # Sanitize the API key — non-ASCII characters in - # credentials (e.g. 
ʋ instead of v from a bad - # copy-paste) cause httpx to fail when encoding - # the Authorization header as ASCII. This is the - # most common cause of persistent UnicodeEncodeError - # that survives message/tool sanitization (#6843). - _credential_sanitized = False - _raw_key = getattr(self, "api_key", None) or "" - if _raw_key: - _clean_key = _strip_non_ascii(_raw_key) - if _clean_key != _raw_key: - self.api_key = _clean_key - if isinstance(getattr(self, "_client_kwargs", None), dict): - self._client_kwargs["api_key"] = _clean_key - # Also update the live client — it holds its - # own copy of api_key which auth_headers reads - # dynamically on every request. - if getattr(self, "client", None) is not None and hasattr(self.client, "api_key"): - self.client.api_key = _clean_key - _credential_sanitized = True - self._vprint( - f"{self.log_prefix}⚠️ API key contained non-ASCII characters " - f"(bad copy-paste?) — stripped them. If auth fails, " - f"re-copy the key from your provider's dashboard.", - force=True, - ) - - # Always retry on ASCII codec detection — - # _force_ascii_payload guarantees the full - # api_kwargs payload is sanitized on the - # next iteration (line ~8475). Even when - # per-component checks above find nothing - # (e.g. non-ASCII only in api_messages' - # reasoning_content), the flag catches it. - # Bounded by _unicode_sanitization_passes < 2. - self._unicode_sanitization_passes += 1 - _any_sanitized = ( - _messages_sanitized - or _prefill_sanitized - or _tools_sanitized - or _system_sanitized - or _headers_sanitized - or _credential_sanitized - ) - if _any_sanitized: - self._vprint( - f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from request payload. 
Retrying...", - force=True, - ) - else: - self._vprint( - f"{self.log_prefix}⚠️ System encoding is ASCII — enabling full-payload sanitization for retry...", - force=True, - ) - continue - - # ── Image-rejection recovery ────────────────────────────── - # Some providers (mlx-lm, text-only endpoints, text-only - # fallbacks on multimodal models) reject any message that - # contains image_url content with a 4xx error like - # "Only 'text' content type is supported." On first hit, - # strip all images from the message list, mark the session - # as vision-unsupported, and retry with text only. - # - # Detection is best-effort English phrase matching — a - # locale-translated or heavily-reworded upstream error - # will bypass this guard and fall through to the normal - # error handler. Expand the phrase list when new - # provider wordings are observed in the wild. - _err_body = "" - try: - _err_body = str(getattr(api_error, "body", None) or - getattr(api_error, "message", None) or - str(api_error)) - except Exception: - pass - _err_status = getattr(api_error, "status_code", None) - _IMAGE_REJECTION_PHRASES = ( - "only 'text' content type is supported", - "only text content type is supported", - "image_url is not supported", - "image content is not supported", - "multimodal is not supported", - "multimodal content is not supported", - "multimodal input is not supported", - "vision is not supported", - "vision input is not supported", - "does not support images", - "does not support image input", - "does not support multimodal", - "does not support vision", - "model does not support image", - # ChatGPT-account Codex backend - # (https://chatgpt.com/backend-api/codex) rejects - # data:image/...base64 URLs in input_image fields - # with HTTP 400 "Invalid 'input[N].content[K].image_url'. - # Expected a valid URL, but got a value with an - # invalid format." The OpenAI Responses API on the - # public endpoint accepts data URLs, but the - # ChatGPT-account variant does not. 
Without this - # phrase the agent cascaded into compression / - # context-too-large recovery instead of just - # stripping the images. Match is narrow on - # purpose — keyed on the field-path apostrophe so - # we don't false-trip on other URL validation - # errors. (issue #23570) - "image_url'. expected", - # DeepSeek's OpenAI-compatible API reports text-only - # request-body variants as: - # "unknown variant `image_url`, expected `text`". - "unknown variant `image_url`, expected `text`", - "unknown variant image_url, expected text", - ) - _err_lower = _err_body.lower() - _looks_like_image_rejection = any( - p in _err_lower for p in _IMAGE_REJECTION_PHRASES - ) - # 4xx-only gate: never interpret 5xx/timeout as "server - # said no to images" — those are transient and must - # route to the normal retry path. - _status_ok = _err_status is None or (400 <= int(_err_status) < 500) - if ( - getattr(self, "_vision_supported", True) - and _looks_like_image_rejection - and _status_ok - ): - self._vision_supported = False - _imgs_removed = _strip_images_from_messages(messages) - if isinstance(api_messages, list): - _strip_images_from_messages(api_messages) - self._vprint( - f"{self.log_prefix}⚠️ Server rejected image content — " - f"switching to text-only mode for this session" - + (". Stripped images from history and retrying." 
if _imgs_removed else "."), - force=True, - ) - continue - - status_code = getattr(api_error, "status_code", None) - error_context = self._extract_api_error_context(api_error) - - # ── Classify the error for structured recovery decisions ── - _compressor = getattr(self, "context_compressor", None) - _ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000 - classified = classify_api_error( - api_error, - provider=getattr(self, "provider", "") or "", - model=getattr(self, "model", "") or "", - approx_tokens=approx_tokens, - context_length=_ctx_len, - num_messages=len(api_messages) if api_messages else 0, - ) - logger.debug( - "Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s", - classified.reason.value, classified.status_code, - classified.retryable, classified.should_compress, - classified.should_rotate_credential, classified.should_fallback, - ) - - recovered_with_pool, has_retried_429 = self._recover_with_credential_pool( - status_code=status_code, - has_retried_429=has_retried_429, - classified_reason=classified.reason, - error_context=error_context, - ) - if recovered_with_pool: - continue - - # Image-too-large recovery: shrink oversized native image - # parts in-place and retry once. Triggered by Anthropic's - # per-image 5 MB ceiling (400 with "image exceeds 5 MB - # maximum") or any other provider that complains about - # image size. If shrink fails or a second attempt still - # fails, fall through to normal error handling. 
- if ( - classified.reason == FailoverReason.image_too_large - and not image_shrink_retry_attempted - ): - image_shrink_retry_attempted = True - if self._try_shrink_image_parts_in_messages(api_messages): - self._vprint( - f"{self.log_prefix}📐 Image(s) exceeded provider size limit — " - f"shrank and retrying...", - force=True, - ) - continue - else: - logger.info( - "image-shrink recovery: no data-URL image parts found " - "or shrink didn't reduce size; surfacing original error." - ) - - # Anthropic OAuth subscription rejected the 1M-context beta - # header ("long context beta is not yet available for this - # subscription"). Disable the beta for the rest of this - # session, rebuild the client, and retry once. 1M-capable - # subscriptions never hit this branch — they accept the - # beta and keep full 1M context. See PR #17680 for the - # original report (we chose reactive recovery over the - # proposed unconditional omit so capable subscriptions - # don't silently lose the capability). - if ( - classified.reason == FailoverReason.oauth_long_context_beta_forbidden - and self.api_mode == "anthropic_messages" - and self._is_anthropic_oauth - and not oauth_1m_beta_retry_attempted - ): - oauth_1m_beta_retry_attempted = True - if not getattr(self, "_oauth_1m_beta_disabled", False): - self._oauth_1m_beta_disabled = True - try: - self._anthropic_client.close() - except Exception: - pass - self._rebuild_anthropic_client() - self._vprint( - f"{self.log_prefix}🔕 OAuth subscription doesn't support " - f"the 1M-context beta — disabled for this session and retrying...", - force=True, - ) - continue - - if ( - self.api_mode == "codex_responses" - and self.provider in {"openai-codex", "xai-oauth"} - and status_code == 401 - and not codex_auth_retry_attempted - ): - codex_auth_retry_attempted = True - if self._try_refresh_codex_client_credentials(force=True): - _label = "xAI OAuth" if self.provider == "xai-oauth" else "Codex" - self._vprint(f"{self.log_prefix}🔐 {_label} auth 
refreshed after 401. Retrying request...") - continue - if ( - self.api_mode == "chat_completions" - and self.provider == "nous" - and status_code == 401 - and not nous_auth_retry_attempted - ): - nous_auth_retry_attempted = True - if self._try_refresh_nous_client_credentials(force=True): - print(f"{self.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...") - continue - # Credential refresh didn't help — show diagnostic info. - # Most common causes: Portal OAuth expired/revoked, - # account out of credits, or agent key blocked. - from hermes_constants import display_hermes_home as _dhh_fn - _dhh = _dhh_fn() - _body_text = "" - try: - _body = getattr(api_error, "body", None) or getattr(api_error, "response", None) - if _body is not None: - _body_text = str(_body)[:200] - except Exception: - pass - print(f"{self.log_prefix}🔐 Nous 401 — Portal authentication failed.") - if _body_text: - print(f"{self.log_prefix} Response: {_body_text}") - print(f"{self.log_prefix} Most likely: Portal OAuth expired, account out of credits, or agent key revoked.") - print(f"{self.log_prefix} Troubleshooting:") - print(f"{self.log_prefix} • Re-authenticate: hermes login --provider nous") - print(f"{self.log_prefix} • Check credits / billing: https://portal.nousresearch.com") - print(f"{self.log_prefix} • Verify stored credentials: {_dhh}/auth.json") - print(f"{self.log_prefix} • Switch providers temporarily: /model --provider openrouter") - if ( - self.provider == "copilot" - and status_code == 401 - and not copilot_auth_retry_attempted - ): - copilot_auth_retry_attempted = True - if self._try_refresh_copilot_client_credentials(): - self._vprint(f"{self.log_prefix}🔐 Copilot credentials refreshed after 401. 
Retrying request...") - continue - if ( - self.api_mode == "anthropic_messages" - and status_code == 401 - and hasattr(self, '_anthropic_api_key') - and not anthropic_auth_retry_attempted - ): - anthropic_auth_retry_attempted = True - from agent.anthropic_adapter import _is_oauth_token - if self._try_refresh_anthropic_client_credentials(): - print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...") - continue - # Credential refresh didn't help — show diagnostic info - key = self._anthropic_api_key - auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)" - print(f"{self.log_prefix}🔐 Anthropic 401 — authentication failed.") - print(f"{self.log_prefix} Auth method: {auth_method}") - print(f"{self.log_prefix} Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{self.log_prefix} Token: (empty or short)") - print(f"{self.log_prefix} Troubleshooting:") - from hermes_constants import display_hermes_home as _dhh_fn - _dhh = _dhh_fn() - print(f"{self.log_prefix} • Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens") - print(f"{self.log_prefix} • Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values") - print(f"{self.log_prefix} • For API keys: verify at https://platform.claude.com/settings/keys") - print(f"{self.log_prefix} • For Claude Code: run 'claude /login' to refresh, then retry") - print(f"{self.log_prefix} • Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"") - print(f"{self.log_prefix} • Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"") - - # ── Thinking block signature recovery ───────────────── - # Anthropic signs thinking blocks against the full turn - # content. Any upstream mutation (context compression, - # session truncation, message merging) invalidates the - # signature → HTTP 400. Recovery: strip reasoning_details - # from all messages so the next retry sends no thinking - # blocks at all. 
One-shot — don't retry infinitely. - if ( - classified.reason == FailoverReason.thinking_signature - and not thinking_sig_retry_attempted - ): - thinking_sig_retry_attempted = True - for _m in messages: - if isinstance(_m, dict): - _m.pop("reasoning_details", None) - self._vprint( - f"{self.log_prefix}⚠️ Thinking block signature invalid — " - f"stripped all thinking blocks, retrying...", - force=True, - ) - logging.warning( - "%sThinking block signature recovery: stripped " - "reasoning_details from %d messages", - self.log_prefix, len(messages), - ) - continue - - # ── llama.cpp grammar-parse recovery ────────────────── - # llama.cpp's ``json-schema-to-grammar`` converter rejects - # regex escape classes (``\d``, ``\w``, ``\s``) and most - # ``format`` values in tool schemas. MCP servers emit - # these routinely for date/phone/email params. Recovery: - # strip ``pattern``/``format`` from ``self.tools`` and - # retry once. We keep the keywords by default so cloud - # providers get the full prompting hints; this branch - # fires only for users on llama.cpp's OAI server. 
- if ( - classified.reason == FailoverReason.llama_cpp_grammar_pattern - and not llama_cpp_grammar_retry_attempted - ): - llama_cpp_grammar_retry_attempted = True - try: - from tools.schema_sanitizer import strip_pattern_and_format - _, _stripped = strip_pattern_and_format(self.tools) - except Exception as _strip_exc: # pragma: no cover — defensive - logging.warning( - "%sllama.cpp grammar recovery: strip helper failed: %s", - self.log_prefix, _strip_exc, - ) - _stripped = 0 - if _stripped: - self._vprint( - f"{self.log_prefix}⚠️ llama.cpp rejected tool schema grammar — " - f"stripped {_stripped} pattern/format keyword(s), retrying...", - force=True, - ) - logging.warning( - "%sllama.cpp grammar recovery: stripped %d " - "pattern/format keyword(s) from tool schemas", - self.log_prefix, _stripped, - ) - continue - # No keywords found to strip — fall through to normal - # retry path rather than loop forever on the same error. - logging.warning( - "%sllama.cpp grammar error but no pattern/format " - "keywords to strip — falling through to normal retry", - self.log_prefix, - ) - - retry_count += 1 - elapsed_time = time.time() - api_start_time - self._touch_activity( - f"API error recovery (attempt {retry_count}/{max_retries})" - ) - - error_type = type(api_error).__name__ - error_msg = str(api_error).lower() - _error_summary = self._summarize_api_error(api_error) - logger.warning( - "API call failed (attempt %s/%s) error_type=%s %s summary=%s", - retry_count, - max_retries, - error_type, - self._client_log_context(), - _error_summary, - ) - - _provider = getattr(self, "provider", "unknown") - _base = getattr(self, "base_url", "unknown") - _model = getattr(self, "model", "unknown") - _status_code_str = f" [HTTP {status_code}]" if status_code else "" - self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True) - self._vprint(f"{self.log_prefix} 🔌 Provider: {_provider} Model: {_model}", 
force=True) - self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True) - self._vprint(f"{self.log_prefix} 📝 Error: {_error_summary}", force=True) - if status_code and status_code < 500: - _err_body = getattr(api_error, "body", None) - _err_body_str = str(_err_body)[:300] if _err_body else None - if _err_body_str: - self._vprint(f"{self.log_prefix} 📋 Details: {_err_body_str}", force=True) - self._vprint(f"{self.log_prefix} ⏱️ Elapsed: {elapsed_time:.2f}s Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens") - - # Actionable hint for OpenRouter "no tool endpoints" error. - # This fires regardless of whether fallback succeeds — the - # user needs to know WHY their model failed so they can fix - # their provider routing, not just silently fall back. - if ( - self._is_openrouter_url() - and "support tool use" in error_msg - ): - self._vprint( - f"{self.log_prefix} 💡 No OpenRouter providers for {_model} support tool calling with your current settings.", - force=True, - ) - if self.providers_allowed: - self._vprint( - f"{self.log_prefix} Your provider_routing.only restriction is filtering out tool-capable providers.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Try removing the restriction or adding providers that support tools for this model.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Check which providers support tools: https://openrouter.ai/models/{_model}", - force=True, - ) - - # Check for interrupt before deciding to retry - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted: handling API error ({error_type}: {self._clean_error_message(str(api_error))}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - - # Actionable hint for GitHub Models 
(Azure) 413 errors. - # The free tier enforces a hard 8K token cap per request, - # which Hermes' system prompt + tool schemas alone exceed. - # Compression can't help — the floor is the system prompt - # itself, not the conversation — so surface a clear "not - # compatible" message instead of looping into three futile - # compression attempts. - if ( - status_code == 413 - and isinstance(_base, str) - and "models.inference.ai.azure.com" in _base - ): - self._vprint( - f"{self.log_prefix} 💡 GitHub Models free tier (models.inference.ai.azure.com) caps every", - force=True, - ) - self._vprint( - f"{self.log_prefix} request at ~8K tokens. Hermes' system prompt + tool schemas baseline", - force=True, - ) - self._vprint( - f"{self.log_prefix} exceeds that floor, so this endpoint cannot run an agentic loop.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Use the `copilot` provider with a Copilot subscription token (`hermes", - force=True, - ) - self._vprint( - f"{self.log_prefix} setup` → GitHub Copilot), or pick any other provider.", - force=True, - ) - - # Check for 413 payload-too-large BEFORE generic 4xx handler. - # A 413 is a payload-size error — the correct response is to - # compress history and retry, not abort immediately. - status_code = getattr(api_error, "status_code", None) - - # ── Anthropic Sonnet long-context tier gate ─────────── - # Anthropic returns HTTP 429 "Extra usage is required for - # long context requests" when a Claude Max (or similar) - # subscription doesn't include the 1M-context tier. This - # is NOT a transient rate limit — retrying or switching - # credentials won't help. Reduce context to 200k (the - # standard tier) and compress. 
- if classified.reason == FailoverReason.long_context_tier: - _reduced_ctx = 200000 - compressor = self.context_compressor - old_ctx = compressor.context_length - if old_ctx > _reduced_ctx: - compressor.update_model( - model=self.model, - context_length=_reduced_ctx, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - # Context probing flags — only set on built-in - # compressor (plugin engines manage their own). - if hasattr(compressor, "_context_probed"): - compressor._context_probed = True - # Don't persist — this is a subscription-tier - # limitation, not a model capability. If the - # user later enables extra usage the 1M limit - # should come back automatically. - compressor._context_probe_persistable = False - self._vprint( - f"{self.log_prefix}⚠️ Anthropic long-context tier " - f"requires extra usage — reducing context: " - f"{old_ctx:,} → {_reduced_ctx:,} tokens", - force=True, - ) - - compression_attempts += 1 - if compression_attempts <= max_compression_attempts: - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, - approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session — clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. - conversation_history = None - if len(messages) < original_len or old_ctx > _reduced_ctx: - self._emit_status( - f"🗜️ Context reduced to {_reduced_ctx:,} tokens " - f"(was {old_ctx:,}), retrying..." - ) - time.sleep(2) - restart_with_compressed_messages = True - break - # Fall through to normal error handling if compression - # is exhausted or didn't help. - - # Eager fallback for rate-limit errors (429 or quota exhaustion). - # When a fallback model is configured, switch immediately instead - # of burning through retries with exponential backoff -- the - # primary provider won't recover within the retry window. 
- is_rate_limited = classified.reason in { - FailoverReason.rate_limit, - FailoverReason.billing, - } - if is_rate_limited and self._fallback_index < len(self._fallback_chain): - # Don't eagerly fallback if credential pool rotation may - # still recover. See _pool_may_recover_from_rate_limit - # for the single-credential-pool and CloudCode-quota - # exceptions. Fixes #11314 and #13636. - pool_may_recover = _pool_may_recover_from_rate_limit( - self._credential_pool, - provider=self.provider, - base_url=getattr(self, "base_url", None), - ) - if not pool_may_recover: - self._emit_status("⚠️ Rate limited — switching to fallback provider...") - if self._try_activate_fallback(reason=classified.reason): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - - # ── Nous Portal: record rate limit & skip retries ───── - # When Nous returns a 429 that is a genuine account- - # level rate limit, record the reset time to a shared - # file so ALL sessions (cron, gateway, auxiliary) know - # not to pile on, then skip further retries -- each - # one burns another RPH request and deepens the hole. - # The retry loop's top-of-iteration guard will catch - # this on the next pass and try fallback or bail. - # - # IMPORTANT: Nous Portal multiplexes multiple upstream - # providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can - # also mean an UPSTREAM provider is out of capacity - # for one specific model -- transient, clears in - # seconds, nothing to do with the caller's quota. - # Tripping the cross-session breaker on that would - # block every Nous model for minutes. We use - # ``is_genuine_nous_rate_limit`` to tell the two - # apart via the 429's own x-ratelimit-* headers and - # the last-known-good state captured on the previous - # successful response. 
- if ( - is_rate_limited - and self.provider == "nous" - and classified.reason == FailoverReason.rate_limit - and not recovered_with_pool - ): - _genuine_nous_rate_limit = False - try: - from agent.nous_rate_guard import ( - is_genuine_nous_rate_limit, - record_nous_rate_limit, - ) - _err_resp = getattr(api_error, "response", None) - _err_hdrs = ( - getattr(_err_resp, "headers", None) - if _err_resp else None - ) - _genuine_nous_rate_limit = is_genuine_nous_rate_limit( - headers=_err_hdrs, - last_known_state=self._rate_limit_state, - ) - if _genuine_nous_rate_limit: - record_nous_rate_limit( - headers=_err_hdrs, - error_context=error_context, - ) - else: - logging.info( - "Nous 429 looks like upstream capacity " - "(no exhausted bucket in headers or " - "last-known state) -- not tripping " - "cross-session breaker." - ) - except Exception: - pass - if _genuine_nous_rate_limit: - # Skip straight to max_retries -- the - # top-of-loop guard will handle fallback or - # bail cleanly. - retry_count = max_retries - continue - # Upstream capacity 429: fall through to normal - # retry logic. A different model (or the same - # model a moment later) will typically succeed. 
- - is_payload_too_large = ( - classified.reason == FailoverReason.payload_too_large - ) - - if is_payload_too_large: - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True) - self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - self._emit_status(f"⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...") - - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session — clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. - conversation_history = None - - if len(messages) < original_len: - self._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") - time.sleep(2) # Brief pause between compression retries - restart_with_compressed_messages = True - break - else: - self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True) - self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}413 payload too large. 
Cannot compress further.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": "Request payload too large (413). Cannot compress further.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - - # Check for context-length errors BEFORE generic 4xx handler. - # The classifier detects context overflow from: explicit error - # messages, generic 400 + large session heuristic (#1630), and - # server disconnect + large session pattern (#2153). - is_context_length_error = ( - classified.reason == FailoverReason.context_overflow - ) - - if is_context_length_error: - compressor = self.context_compressor - old_ctx = compressor.context_length - - # ── Distinguish two very different errors ─────────── - # 1. "Prompt too long": the INPUT exceeds the context window. - # Fix: reduce context_length + compress history. - # 2. "max_tokens too large": input is fine, but - # input_tokens + requested max_tokens > context_window. - # Fix: reduce max_tokens (the OUTPUT cap) for this call. - # Do NOT shrink context_length — the window is unchanged. - # - # Note: max_tokens = output token cap (one response). - # context_length = total window (input + output combined). - available_out = parse_available_output_tokens_from_error(error_msg) - if available_out is not None: - # Error is purely about the output cap being too large. - # Cap output to the available space and retry without - # touching context_length or triggering compression. 
- safe_out = max(1, available_out - 64) # small safety margin - self._ephemeral_max_output_tokens = safe_out - self._vprint( - f"{self.log_prefix}⚠️ Output cap too large for current prompt — " - f"retrying with max_tokens={safe_out:,} " - f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})", - force=True, - ) - # Still count against compression_attempts so we don't - # loop forever if the error keeps recurring. - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) - self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - restart_with_compressed_messages = True - break - - # Error is about the INPUT being too large — reduce context_length. 
- # Try to parse the actual limit from the error message - parsed_limit = parse_context_limit_from_error(error_msg) - _provider_lower = (getattr(self, "provider", "") or "").lower() - _base_lower = (getattr(self, "base_url", "") or "").rstrip("/").lower() - is_minimax_provider = ( - _provider_lower in {"minimax", "minimax-cn"} - or _base_lower.startswith(( - "https://api.minimax.io/anthropic", - "https://api.minimaxi.com/anthropic", - )) - ) - minimax_delta_only_overflow = ( - is_minimax_provider - and parsed_limit is None - and "context window exceeds limit (" in error_msg - ) - if parsed_limit and parsed_limit < old_ctx: - new_ctx = parsed_limit - self._vprint(f"{self.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True) - elif minimax_delta_only_overflow: - new_ctx = old_ctx - self._vprint( - f"{self.log_prefix}Provider reported overflow amount only; " - f"keeping context_length at {old_ctx:,} tokens and compressing.", - force=True, - ) - else: - # Step down to the next probe tier - new_ctx = get_next_probe_tier(old_ctx) - - if new_ctx and new_ctx < old_ctx: - compressor.update_model( - model=self.model, - context_length=new_ctx, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - # Context probing flags — only set on built-in - # compressor (plugin engines manage their own). - if hasattr(compressor, "_context_probed"): - compressor._context_probed = True - # Only persist limits parsed from the provider's - # error message (a real number). Guessed fallback - # tiers from get_next_probe_tier() should stay - # in-memory only — persisting them pollutes the - # cache with wrong values. 
- compressor._context_probe_persistable = bool( - parsed_limit and parsed_limit == new_ctx - ) - self._vprint(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True) - else: - self._vprint(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...", force=True) - - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) - self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - self._emit_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...") - - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session — clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. 
- conversation_history = None - - if len(messages) < original_len or new_ctx and new_ctx < old_ctx: - if len(messages) < original_len: - self._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") - time.sleep(2) # Brief pause between compression retries - restart_with_compressed_messages = True - break - else: - # Can't compress further and already at minimum tier - self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True) - self._vprint(f"{self.log_prefix} 💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True) - logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - - # Check for non-retryable client errors. The classifier - # already accounts for 413, 429, 529 (transient), context - # overflow, and generic-400 heuristics. Local validation - # errors (ValueError, TypeError) are programming bugs. - # Exclude UnicodeEncodeError — it's a ValueError subclass - # but is handled separately by the surrogate sanitization - # path above. Exclude json.JSONDecodeError — also a - # ValueError subclass, but it indicates a transient - # provider/network failure (malformed response body, - # truncated stream, routing layer corruption), not a - # local programming bug, and should be retried (#14782). - # Exclude Anthropic stream parser ValueErrors for the - # same reason: third-party Anthropic-compatible providers - # can emit malformed event-stream frames that SDK parsers - # raise as plain ValueError. 
- is_local_validation_error = ( - isinstance(api_error, (ValueError, TypeError)) - and not isinstance( - api_error, (UnicodeEncodeError, json.JSONDecodeError) - ) - and not self._is_provider_stream_parse_error(api_error) - # ssl.SSLError (and its subclass SSLCertVerificationError) - # inherits from OSError *and* ValueError via Python MRO, - # so the isinstance(ValueError) check above would - # misclassify a TLS transport failure as a local - # programming bug and abort without retrying. Exclude - # ssl.SSLError explicitly so the error classifier's - # retryable=True mapping takes effect instead. - and not isinstance(api_error, ssl.SSLError) - ) - is_client_error = ( - is_local_validation_error - or ( - not classified.retryable - and not classified.should_compress - and classified.reason not in { - FailoverReason.rate_limit, - FailoverReason.billing, - FailoverReason.overloaded, - FailoverReason.context_overflow, - FailoverReason.payload_too_large, - FailoverReason.long_context_tier, - FailoverReason.thinking_signature, - } - ) - ) and not is_context_length_error - - if is_client_error: - # Try fallback before aborting — a different provider - # may not have the same issue (rate limit, auth, etc.) - self._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - if api_kwargs is not None: - self._dump_api_request_debug( - api_kwargs, reason="non_retryable_client_error", error=api_error, - ) - self._emit_status( - f"❌ Non-retryable error (HTTP {status_code}): " - f"{self._summarize_api_error(api_error)}" - ) - self._vprint(f"{self.log_prefix}❌ Non-retryable client error (HTTP {status_code}). 
Aborting.", force=True) - self._vprint(f"{self.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True) - self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True) - # Actionable guidance for common auth errors - if classified.is_auth or classified.reason == FailoverReason.billing: - if _provider in {"openai-codex", "xai-oauth"} and status_code == 401: - if _provider == "openai-codex": - self._vprint(f"{self.log_prefix} 💡 Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True) - self._vprint(f"{self.log_prefix} refreshed by another client (Codex CLI, VS Code). To fix:", force=True) - self._vprint(f"{self.log_prefix} 1. Run `codex` in your terminal to generate fresh tokens.", force=True) - self._vprint(f"{self.log_prefix} 2. Then run `hermes auth` to re-authenticate.", force=True) - else: - self._vprint(f"{self.log_prefix} 💡 xAI OAuth token was rejected (HTTP 401). To fix:", force=True) - self._vprint(f"{self.log_prefix} re-authenticate with xAI Grok OAuth (SuperGrok Subscription) from `hermes model`.", force=True) - else: - self._vprint(f"{self.log_prefix} 💡 Your API key was rejected by the provider. Check:", force=True) - self._vprint(f"{self.log_prefix} • Is the key valid? Run: hermes setup", force=True) - self._vprint(f"{self.log_prefix} • Does your account have access to {_model}?", force=True) - if base_url_host_matches(str(_base), "openrouter.ai"): - self._vprint(f"{self.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True) - else: - self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True) - logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") - # Skip session persistence when the error is likely - # context-overflow related (status 400 + large session). - # Persisting the failed user message would make the - # session even larger, causing the same failure on the - # next attempt. 
(#1630) - if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80): - self._vprint( - f"{self.log_prefix}⚠️ Skipping session persistence " - f"for large failed session to prevent growth loop.", - force=True, - ) - else: - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": str(api_error), - } - - if retry_count >= max_retries: - # Before falling back, try rebuilding the primary - # client once for transient transport errors (stale - # connection pool, TCP reset). Only attempted once - # per API call block. - if not primary_recovery_attempted and self._try_recover_primary_transport( - api_error, retry_count=retry_count, max_retries=max_retries, - ): - primary_recovery_attempted = True - retry_count = 0 - continue - # Try fallback before giving up entirely - self._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - _final_summary = self._summarize_api_error(api_error) - if is_rate_limited: - self._emit_status(f"❌ Rate limited after {max_retries} retries — {_final_summary}") - else: - self._emit_status(f"❌ API failed after {max_retries} retries — {_final_summary}") - self._vprint(f"{self.log_prefix} 💀 Final error: {_final_summary}", force=True) - - # Detect SSE stream-drop pattern (e.g. "Network - # connection lost") and surface actionable guidance. - # This typically happens when the model generates a - # very large tool call (write_file with huge content) - # and the proxy/CDN drops the stream mid-response. 
- _is_stream_drop = ( - not getattr(api_error, "status_code", None) - and any(p in error_msg for p in ( - "connection lost", "connection reset", - "connection closed", "network connection", - "network error", "terminated", - )) - ) - if _is_stream_drop: - self._vprint( - f"{self.log_prefix} 💡 The provider's stream " - f"connection keeps dropping. This often happens " - f"when the model tries to write a very large " - f"file in a single tool call.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Try asking the model " - f"to use execute_code with Python's open() for " - f"large files, or to write the file in smaller " - f"sections.", - force=True, - ) - - logging.error( - "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s", - self.log_prefix, max_retries, _final_summary, - _provider, _model, len(api_messages), f"{approx_tokens:,}", - ) - if api_kwargs is not None: - self._dump_api_request_debug( - api_kwargs, reason="max_retries_exhausted", error=api_error, - ) - self._persist_session(messages, conversation_history) - _final_response = f"API call failed after {max_retries} retries: {_final_summary}" - if _is_stream_drop: - _final_response += ( - "\n\nThe provider's stream connection keeps " - "dropping — this often happens when generating " - "very large tool call responses (e.g. write_file " - "with long content). Try asking me to use " - "execute_code with Python's open() for large " - "files, or to write in smaller sections." 
- ) - return { - "final_response": _final_response, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": _final_summary, - } - - # For rate limits, respect the Retry-After header if present - _retry_after = None - if is_rate_limited: - _resp_headers = getattr(getattr(api_error, "response", None), "headers", None) - if _resp_headers and hasattr(_resp_headers, "get"): - _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After") - if _ra_raw: - try: - _retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes - except (TypeError, ValueError): - pass - wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0) - if is_rate_limited: - self._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...") - else: - self._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...") - logger.warning( - "Retrying API call in %ss (attempt %s/%s) %s error=%s", - wait_time, - retry_count, - max_retries, - self._client_log_context(), - api_error, - ) - # Sleep in small increments so we can respond to interrupts quickly - # instead of blocking the entire wait_time in one sleep() call - sleep_end = time.time() + wait_time - _backoff_touch_counter = 0 - while time.time() < sleep_end: - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - time.sleep(0.2) # Check interrupt every 200ms - # Touch activity every ~30s so the gateway's inactivity - # monitor knows we're alive during backoff waits. 
- _backoff_touch_counter += 1 - if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s - self._touch_activity( - f"error retry backoff ({retry_count}/{max_retries}), " - f"{int(sleep_end - time.time())}s remaining" - ) - - # If the API call was interrupted, skip response processing - if interrupted: - _turn_exit_reason = "interrupted_during_api_call" - break - - if restart_with_compressed_messages: - api_call_count -= 1 - self.iteration_budget.refund() - # Count compression restarts toward the retry limit to prevent - # infinite loops when compression reduces messages but not enough - # to fit the context window. - retry_count += 1 - restart_with_compressed_messages = False - continue - - if restart_with_length_continuation: - # Progressively boost the output token budget on each retry. - # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768. - # Applies to all providers via _ephemeral_max_output_tokens. - _boost_base = self.max_tokens if self.max_tokens else 4096 - _boost = _boost_base * (length_continue_retries + 1) - self._ephemeral_max_output_tokens = min(_boost, 32768) - continue - - # Guard: if all retries exhausted without a successful response - # (e.g. repeated context-length errors that exhausted retry_count), - # the `response` variable is still None. Break out cleanly. - if response is None: - _turn_exit_reason = "all_retries_exhausted_no_response" - print(f"{self.log_prefix}❌ All API retries exhausted with no successful response.") - self._persist_session(messages, conversation_history) - break - - try: - _transport = self._get_transport() - _normalize_kwargs = {} - if self.api_mode == "anthropic_messages": - _normalize_kwargs["strip_tool_prefix"] = self._is_anthropic_oauth - normalized = _transport.normalize_response(response, **_normalize_kwargs) - assistant_message = normalized - finish_reason = normalized.finish_reason - - # Normalize content to string — some OpenAI-compatible servers - # (llama-server, etc.) 
return content as a dict or list instead - # of a plain string, which crashes downstream .strip() calls. - if assistant_message.content is not None and not isinstance(assistant_message.content, str): - raw = assistant_message.content - if isinstance(raw, dict): - assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw) - elif isinstance(raw, list): - # Multimodal content list — extract text parts - parts = [] - for part in raw: - if isinstance(part, str): - parts.append(part) - elif isinstance(part, dict) and part.get("type") == "text": - parts.append(part.get("text", "")) - elif isinstance(part, dict) and "text" in part: - parts.append(str(part["text"])) - assistant_message.content = "\n".join(parts) - else: - assistant_message.content = str(raw) - - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or [] - _assistant_text = assistant_message.content or "" - _invoke_hook( - "post_api_request", - task_id=effective_task_id, - session_id=self.session_id or "", - platform=self.platform or "", - model=self.model, - provider=self.provider, - base_url=self.base_url, - api_mode=self.api_mode, - api_call_count=api_call_count, - api_duration=api_duration, - finish_reason=finish_reason, - message_count=len(api_messages), - response_model=getattr(response, "model", None), - response=response, - usage=self._usage_summary_for_api_request_hook(response), - assistant_message=assistant_message, - assistant_content_chars=len(_assistant_text), - assistant_tool_call_count=len(_assistant_tool_calls), - ) - except Exception: - pass - - # Handle assistant response - if assistant_message.content and not self.quiet_mode: - if self.verbose_logging: - self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content}") - else: - self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' 
if len(assistant_message.content) > 100 else ''}") - - # Notify progress callback of model's thinking (used by subagent - # delegation to relay the child's reasoning to the parent display). - if (assistant_message.content and self.tool_progress_callback): - _think_text = assistant_message.content.strip() - # Strip reasoning XML tags that shouldn't leak to parent display - _think_text = re.sub( - r'', '', _think_text - ).strip() - # For subagents: relay first line to parent display (existing behaviour). - # For all agents with a structured callback: emit reasoning.available event. - first_line = _think_text.split('\n')[0][:80] if _think_text else "" - if first_line and getattr(self, '_delegate_depth', 0) > 0: - try: - self.tool_progress_callback("_thinking", first_line) - except Exception: - pass - elif _think_text: - try: - self.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None) - except Exception: - pass - - # Check for incomplete (opened but never closed) - # This means the model ran out of output tokens mid-reasoning — retry up to 2 times - if has_incomplete_scratchpad(assistant_message.content or ""): - self._incomplete_scratchpad_retries += 1 - - self._vprint(f"{self.log_prefix}⚠️ Incomplete detected (opened but never closed)") - - if self._incomplete_scratchpad_retries <= 2: - self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...") - # Don't add the broken message, just retry - continue - else: - # Max retries - discard this turn and save as partial - self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. 
Saving as partial.", force=True) - self._incomplete_scratchpad_retries = 0 - - rolled_back_messages = self._get_messages_up_to_last_assistant(messages) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - - return { - "final_response": None, - "messages": rolled_back_messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Incomplete REASONING_SCRATCHPAD after 2 retries" - } - - # Reset incomplete scratchpad counter on clean response - self._incomplete_scratchpad_retries = 0 - - if self.api_mode == "codex_responses" and finish_reason == "incomplete": - self._codex_incomplete_retries += 1 - - interim_msg = self._build_assistant_message(assistant_message, finish_reason) - interim_has_content = bool((interim_msg.get("content") or "").strip()) - interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False - interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items")) - interim_has_codex_message_items = bool(interim_msg.get("codex_message_items")) - - if ( - interim_has_content - or interim_has_reasoning - or interim_has_codex_reasoning - or interim_has_codex_message_items - ): - last_msg = messages[-1] if messages else None - # Duplicate detection: two consecutive incomplete assistant - # messages with identical content AND reasoning are collapsed. - # For provider-state-only changes (encrypted reasoning - # items or replayable message ids/phases/statuses differ - # while visible content/reasoning are unchanged), compare - # those opaque payloads too so we don't silently drop the - # newer continuation state. 
- last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None - interim_codex_items = interim_msg.get("codex_reasoning_items") - last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None - interim_codex_message_items = interim_msg.get("codex_message_items") - duplicate_interim = ( - isinstance(last_msg, dict) - and last_msg.get("role") == "assistant" - and last_msg.get("finish_reason") == "incomplete" - and (last_msg.get("content") or "") == (interim_msg.get("content") or "") - and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "") - and last_codex_items == interim_codex_items - and last_codex_message_items == interim_codex_message_items - ) - if not duplicate_interim: - messages.append(interim_msg) - self._emit_interim_assistant_message(interim_msg) - - if self._codex_incomplete_retries < 3: - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)") - self._session_messages = messages - self._save_session_log(messages) - continue - - self._codex_incomplete_retries = 0 - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Codex response remained incomplete after 3 continuation attempts", - } - elif hasattr(self, "_codex_incomplete_retries"): - self._codex_incomplete_retries = 0 - - # Check for tool calls - if assistant_message.tool_calls: - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...") - - if self.verbose_logging: - for tc in assistant_message.tool_calls: - logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") - - # Validate tool call names - detect model hallucinations - # Repair mismatched tool names before validating 
- for tc in assistant_message.tool_calls: - if tc.function.name not in self.valid_tool_names: - repaired = self._repair_tool_call(tc.function.name) - if repaired: - print(f"{self.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'") - tc.function.name = repaired - invalid_tool_calls = [ - tc.function.name for tc in assistant_message.tool_calls - if tc.function.name not in self.valid_tool_names - ] - if invalid_tool_calls: - # Track retries for invalid tool calls - self._invalid_tool_retries += 1 - - # Return helpful error to model — model can self-correct next turn - available = ", ".join(sorted(self.valid_tool_names)) - invalid_name = invalid_tool_calls[0] - invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name - self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)") - - if self._invalid_tool_retries >= 3: - self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True) - self._invalid_tool_retries = 0 - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": f"Model generated invalid tool call: {invalid_preview}" - } - - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - messages.append(assistant_msg) - for tc in assistant_message.tool_calls: - if tc.function.name not in self.valid_tool_names: - content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}" - else: - content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call." 
- messages.append({ - "role": "tool", - "name": tc.function.name, - "tool_call_id": tc.id, - "content": content, - }) - continue - # Reset retry counter on successful tool call validation - self._invalid_tool_retries = 0 - - # Validate tool call arguments are valid JSON - # Handle empty strings as empty objects (common model quirk) - invalid_json_args = [] - for tc in assistant_message.tool_calls: - args = tc.function.arguments - if isinstance(args, (dict, list)): - tc.function.arguments = json.dumps(args) - continue - if args is not None and not isinstance(args, str): - tc.function.arguments = str(args) - args = tc.function.arguments - # Treat empty/whitespace strings as empty object - if not args or not args.strip(): - tc.function.arguments = "{}" - continue - try: - json.loads(args) - except json.JSONDecodeError as e: - invalid_json_args.append((tc.function.name, str(e))) - - if invalid_json_args: - # Check if the invalid JSON is due to truncation rather - # than a model formatting mistake. Routers sometimes - # rewrite finish_reason from "length" to "tool_calls", - # hiding the truncation from the length handler above. - # Detect truncation: args that don't end with } or ] - # (after stripping whitespace) are cut off mid-stream. 
- _truncated = any( - not (tc.function.arguments or "").rstrip().endswith(("}", "]")) - for tc in assistant_message.tool_calls - if tc.function.name in {n for n, _ in invalid_json_args} - ) - if _truncated: - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call arguments detected " - f"(finish_reason={finish_reason!r}) — refusing to execute.", - force=True, - ) - self._invalid_json_retries = 0 - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit", - } - - # Track retries for invalid JSON arguments - self._invalid_json_retries += 1 - - tool_name, error_msg = invalid_json_args[0] - self._vprint(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") - - if self._invalid_json_retries < 3: - self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...") - # Don't add anything to messages, just retry the API call - continue - else: - # Instead of returning partial, inject tool error results so the model can recover. - # Using tool results (not user messages) preserves role alternation. - self._vprint(f"{self.log_prefix}⚠️ Injecting recovery tool results for invalid JSON...") - self._invalid_json_retries = 0 # Reset for next attempt - - # Append the assistant message with its (broken) tool_calls - recovery_assistant = self._build_assistant_message(assistant_message, finish_reason) - messages.append(recovery_assistant) - - # Respond with tool error results for each tool call - invalid_names = {name for name, _ in invalid_json_args} - for tc in assistant_message.tool_calls: - if tc.function.name in invalid_names: - err = next(e for n, e in invalid_json_args if n == tc.function.name) - tool_result = ( - f"Error: Invalid JSON arguments. {err}. 
" - f"For tools with no required parameters, use an empty object: {{}}. " - f"Please retry with valid JSON." - ) - else: - tool_result = "Skipped: other tool call in this response had invalid JSON." - messages.append({ - "role": "tool", - "name": tc.function.name, - "tool_call_id": tc.id, - "content": tool_result, - }) - continue - - # Reset retry counter on successful JSON validation - self._invalid_json_retries = 0 - - # ── Post-call guardrails ────────────────────────── - assistant_message.tool_calls = self._cap_delegate_task_calls( - assistant_message.tool_calls - ) - assistant_message.tool_calls = self._deduplicate_tool_calls( - assistant_message.tool_calls - ) - - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - - # If this turn has both content AND tool_calls, capture the content - # as a fallback final response. Common pattern: model delivers its - # answer and calls memory/skill tools as a side-effect in the same - # turn. If the follow-up turn after tools is empty, we use this. - turn_content = assistant_message.content or "" - if turn_content and self._has_content_after_think_block(turn_content): - self._last_content_with_tools = turn_content - # Only mute subsequent output when EVERY tool call in - # this turn is post-response housekeeping (memory, todo, - # skill_manage, etc.). If any substantive tool is present - # (search_files, read_file, write_file, terminal, ...), - # keep output visible so the user sees progress. 
- _HOUSEKEEPING_TOOLS = frozenset({ - "memory", "todo", "skill_manage", "session_search", - }) - _all_housekeeping = all( - tc.function.name in _HOUSEKEEPING_TOOLS - for tc in assistant_message.tool_calls - ) - self._last_content_tools_all_housekeeping = _all_housekeeping - if _all_housekeeping and self._has_stream_consumers(): - self._mute_post_response = True - elif self._should_emit_quiet_tool_messages(): - clean = self._strip_think_blocks(turn_content).strip() - if clean: - self._vprint(f" ┊ 💬 {clean}") - - # Pop thinking-only prefill message(s) before appending - # (tool-call path — same rationale as the final-response path). - _had_prefill = False - while ( - messages - and isinstance(messages[-1], dict) - and messages[-1].get("_thinking_prefill") - ): - messages.pop() - _had_prefill = True - - # Reset prefill counter when tool calls follow a prefill - # recovery. Without this, the counter accumulates across - # the whole conversation — a model that intermittently - # empties (empty → prefill → tools → empty → prefill → - # tools) burns both prefill attempts and the third empty - # gets zero recovery. Resetting here treats each tool- - # call success as a fresh start. - if _had_prefill: - self._thinking_prefill_retries = 0 - self._empty_content_retries = 0 - # Successful tool execution — reset the post-tool nudge - # flag so it can fire again if the model goes empty on - # a LATER tool round. - self._post_tool_empty_retried = False - - messages.append(assistant_msg) - self._emit_interim_assistant_message(assistant_msg) - - # Close any open streaming display (response box, reasoning - # box) before tool execution begins. Intermediate turns may - # have streamed early content that opened the response box; - # flushing here prevents it from wrapping tool feed lines. - # Only signal the display callback — TTS (_stream_callback) - # should NOT receive None (it uses None as end-of-stream). 
- if self.stream_delta_callback: - try: - self.stream_delta_callback(None) - except Exception: - pass - - self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count) - - if self._tool_guardrail_halt_decision is not None: - decision = self._tool_guardrail_halt_decision - _turn_exit_reason = "guardrail_halt" - final_response = self._toolguard_controlled_halt_response(decision) - self._emit_status( - f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}" - ) - messages.append({"role": "assistant", "content": final_response}) - break - - # Reset per-turn retry counters after successful tool - # execution so a single truncation doesn't poison the - # entire conversation. - truncated_tool_call_retries = 0 - - # Signal that a paragraph break is needed before the next - # streamed text. We don't emit it immediately because - # multiple consecutive tool iterations would stack up - # redundant blank lines. Instead, _fire_stream_delta() - # will prepend a single "\n\n" the next time real text - # arrives. - self._stream_needs_break = True - - # Refund the iteration if the ONLY tool(s) called were - # execute_code (programmatic tool calling). These are - # cheap RPC-style calls that shouldn't eat the budget. - _tc_names = {tc.function.name for tc in assistant_message.tool_calls} - if _tc_names == {"execute_code"}: - self.iteration_budget.refund() - - # Use real token counts from the API response to decide - # compression. prompt_tokens + completion_tokens is the - # actual context size the provider reported plus the - # assistant turn — a tight lower bound for the next prompt. - # Tool results appended above aren't counted yet, but the - # threshold (default 50%) leaves ample headroom; if tool - # results push past it, the next API call will report the - # real total and trigger compression then. 
- # - # If last_prompt_tokens is 0 (stale after API disconnect - # or provider returned no usage data), fall back to rough - # estimate to avoid missing compression. Without this, - # a session can grow unbounded after disconnects because - # should_compress(0) never fires. (#2153) - _compressor = self.context_compressor - if _compressor.last_prompt_tokens > 0: - # Only use prompt_tokens — completion/reasoning - # tokens don't consume context window space. - # Thinking models (GLM-5.1, QwQ, DeepSeek R1) - # inflate completion_tokens with reasoning, - # causing premature compression. (#12026) - _real_tokens = _compressor.last_prompt_tokens - else: - # Include tool schemas — with 50+ tools enabled - # these add 20-30K tokens the messages-only - # estimate misses, which can skip compression - # past the configured threshold (#14695). - _real_tokens = estimate_request_tokens_rough( - messages, tools=self.tools or None - ) - - if self.compression_enabled and _compressor.should_compress(_real_tokens): - self._safe_print(" ⟳ compacting context…") - messages, active_system_prompt = self._compress_context( - messages, system_message, - approx_tokens=self.context_compressor.last_prompt_tokens, - task_id=effective_task_id, - ) - # Compression created a new session — clear history so - # _flush_messages_to_session_db writes compressed messages - # to the new session (see preflight compression comment). - conversation_history = None - - # Save session log incrementally (so progress is visible even if interrupted) - self._session_messages = messages - self._save_session_log(messages) - - # Continue loop for next response - continue - - else: - # No tool calls - this is the final response - final_response = assistant_message.content or "" - - # Fix: unmute output when entering the no-tool-call branch - # so the user can see empty-response warnings and recovery - # status messages. 
_mute_post_response was set during a - # prior housekeeping tool turn and should not silence the - # final response path. - self._mute_post_response = False - - # Check if response only has think block with no actual content after it - if not self._has_content_after_think_block(final_response): - # ── Partial stream recovery ───────────────────── - # If content was already streamed to the user before - # the connection died, use it as the final response - # instead of falling through to prior-turn fallback - # or wasting API calls on retries. - _partial_streamed = ( - getattr(self, "_current_streamed_assistant_text", "") or "" - ) - if self._has_content_after_think_block(_partial_streamed): - _turn_exit_reason = "partial_stream_recovery" - _recovered = self._strip_think_blocks(_partial_streamed).strip() - logger.info( - "Partial stream content delivered (%d chars) " - "— using as final response", - len(_recovered), - ) - self._emit_status( - "↻ Stream interrupted — using delivered content " - "as final response" - ) - final_response = _recovered - self._response_was_previewed = True - break - - # If the previous turn already delivered real content alongside - # HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save), - # the model has nothing more to say. Use the earlier content - # immediately instead of wasting API calls on retries. - # NOTE: Only use this shortcut when ALL tools in that turn were - # housekeeping (memory, todo, etc.). When substantive tools - # were called (terminal, search_files, etc.), the content was - # likely mid-task narration ("I'll scan the directory...") and - # the empty follow-up means the model choked — let the - # post-tool nudge below handle that instead of exiting early. 
- fallback = getattr(self, '_last_content_with_tools', None) - if fallback and getattr(self, '_last_content_tools_all_housekeeping', False): - _turn_exit_reason = "fallback_prior_turn_content" - logger.info("Empty follow-up after tool calls — using prior turn content as final response") - self._emit_status("↻ Empty response after tool calls — using earlier content as final answer") - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._empty_content_retries = 0 - # Do NOT modify the assistant message content — the - # old code injected "Calling the X tools..." which - # poisoned the conversation history. Just use the - # fallback text as the final response and break. - final_response = self._strip_think_blocks(fallback).strip() - self._response_was_previewed = True - break - - # ── Post-tool-call empty response nudge ─────────── - # The model returned empty after executing tool calls. - # This covers two cases: - # (a) No prior-turn content at all — model went silent - # (b) Prior turn had content + SUBSTANTIVE tools (the - # fallback above was skipped because the content - # was mid-task narration, not a final answer) - # Instead of giving up, nudge the model to continue by - # appending a user-level hint. This is the #9400 case: - # weaker models (mimo-v2-pro, GLM-5, etc.) sometimes - # return empty after tool results instead of continuing - # to the next step. One retry with a nudge usually - # fixes it. - _prior_was_tool = any( - m.get("role") == "tool" - for m in messages[-5:] # check recent messages - ) - # Detect Qwen3/Ollama-style in-content thinking blocks. - # Ollama puts in the content field (not in - # reasoning_content), so _has_structured below would - # miss it. We check here so thinking-only responses - # after tool calls route to prefill instead of nudge. 
- _has_inline_thinking = bool( - re.search( - r'||', - final_response or "", - re.IGNORECASE, - ) - ) - if ( - _prior_was_tool - and not getattr(self, "_post_tool_empty_retried", False) - and not _has_inline_thinking # thinking model still working — let prefill handle - ): - self._post_tool_empty_retried = True - # Clear stale narration so it doesn't resurface - # on a later empty response after the nudge. - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - logger.info( - "Empty response after tool calls — nudging model " - "to continue processing" - ) - self._emit_status( - "⚠️ Model returned empty after tool calls — " - "nudging to continue" - ) - # Append the empty assistant message first so the - # message sequence stays valid: - # tool(result) → assistant("(empty)") → user(nudge) - # Without this, we'd have tool → user which most - # APIs reject as an invalid sequence. - _nudge_msg = self._build_assistant_message(assistant_message, finish_reason) - _nudge_msg["content"] = "(empty)" - _nudge_msg["_empty_recovery_synthetic"] = True - messages.append(_nudge_msg) - messages.append({ - "role": "user", - "content": ( - "You just executed tool calls but returned an " - "empty response. Please process the tool " - "results above and continue with the task." - ), - "_empty_recovery_synthetic": True, - }) - continue - - # ── Thinking-only prefill continuation ────────── - # The model produced structured reasoning (via API - # fields) but no visible text content. Rather than - # giving up, append the assistant message as-is and - # continue — the model will see its own reasoning - # on the next turn and produce the text portion. - # Inspired by clawdbot's "incomplete-text" recovery. - # Also covers Qwen3/Ollama in-content blocks - # (detected above as _has_inline_thinking). 
- _has_structured = bool( - getattr(assistant_message, "reasoning", None) - or getattr(assistant_message, "reasoning_content", None) - or getattr(assistant_message, "reasoning_details", None) - or _has_inline_thinking - ) - if _has_structured and self._thinking_prefill_retries < 2: - self._thinking_prefill_retries += 1 - logger.info( - "Thinking-only response (no visible content) — " - "prefilling to continue (%d/2)", - self._thinking_prefill_retries, - ) - self._emit_status( - f"↻ Thinking-only response — prefilling to continue " - f"({self._thinking_prefill_retries}/2)" - ) - interim_msg = self._build_assistant_message( - assistant_message, "incomplete" - ) - interim_msg["_thinking_prefill"] = True - messages.append(interim_msg) - self._session_messages = messages - self._save_session_log(messages) - continue - - # ── Empty response retry ────────────────────── - # Model returned nothing usable. Retry up to 3 - # times before attempting fallback. This covers - # both truly empty responses (no content, no - # reasoning) AND reasoning-only responses after - # prefill exhaustion — models like mimo-v2-pro - # always populate reasoning fields via OpenRouter, - # so the old `not _has_structured` guard blocked - # retries for every reasoning model after prefill. 
- _truly_empty = not self._strip_think_blocks( - final_response - ).strip() - _prefill_exhausted = ( - _has_structured - and self._thinking_prefill_retries >= 2 - ) - if _truly_empty and (not _has_structured or _prefill_exhausted) and self._empty_content_retries < 3: - self._empty_content_retries += 1 - logger.warning( - "Empty response (no content or reasoning) — " - "retry %d/3 (model=%s)", - self._empty_content_retries, self.model, - ) - self._emit_status( - f"⚠️ Empty response from model — retrying " - f"({self._empty_content_retries}/3)" - ) - continue - - # ── Exhausted retries — try fallback provider ── - # Before giving up with "(empty)", attempt to - # switch to the next provider in the fallback - # chain. This covers the case where a model - # (e.g. GLM-4.5-Air) consistently returns empty - # due to context degradation or provider issues. - if _truly_empty and self._fallback_chain: - logger.warning( - "Empty response after %d retries — " - "attempting fallback (model=%s, provider=%s)", - self._empty_content_retries, self.model, - self.provider, - ) - self._emit_status( - "⚠️ Model returning empty responses — " - "switching to fallback provider..." - ) - if self._try_activate_fallback(): - self._empty_content_retries = 0 - self._emit_status( - f"↻ Switched to fallback: {self.model} " - f"({self.provider})" - ) - logger.info( - "Fallback activated after empty responses: " - "now using %s on %s", - self.model, self.provider, - ) - continue - - # Exhausted retries and fallback chain (or no - # fallback configured). Fall through to the - # "(empty)" terminal. - _turn_exit_reason = "empty_response_exhausted" - reasoning_text = self._extract_reasoning(assistant_message) - self._drop_trailing_empty_response_scaffolding(messages) - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - assistant_msg["content"] = "(empty)" - # This is a user-facing failure sentinel for the gateway, - # not real assistant content. 
Persisting it makes later - # "continue" turns replay assistant("(empty)") as if it - # were a meaningful model response, which can keep long - # tool-heavy sessions stuck in empty-response loops. - assistant_msg["_empty_terminal_sentinel"] = True - messages.append(assistant_msg) - - if reasoning_text: - reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text - logger.warning( - "Reasoning-only response (no visible content) " - "after exhausting retries and fallback. " - "Reasoning: %s", reasoning_preview, - ) - self._emit_status( - "⚠️ Model produced reasoning but no visible " - "response after all retries. Returning empty." - ) - else: - logger.warning( - "Empty response (no content or reasoning) " - "after %d retries. No fallback available. " - "model=%s provider=%s", - self._empty_content_retries, self.model, - self.provider, - ) - self._emit_status( - "❌ Model returned no content after all retries" - + (" and fallback attempts." if self._fallback_chain else - ". No fallback providers configured.") - ) - - final_response = "(empty)" - break - - # Reset retry counter/signature on successful content - self._empty_content_retries = 0 - self._thinking_prefill_retries = 0 - - if ( - self.api_mode == "codex_responses" - and self.valid_tool_names - and codex_ack_continuations < 2 - and self._looks_like_codex_intermediate_ack( - user_message=user_message, - assistant_content=final_response, - messages=messages, - ) - ): - codex_ack_continuations += 1 - interim_msg = self._build_assistant_message(assistant_message, "incomplete") - messages.append(interim_msg) - self._emit_interim_assistant_message(interim_msg) - - continue_msg = { - "role": "user", - "content": ( - "[System: Continue now. 
Execute the required tool calls and only " - "send your final answer after completing the task.]" - ), - } - messages.append(continue_msg) - self._session_messages = messages - self._save_session_log(messages) - continue - - codex_ack_continuations = 0 - - if truncated_response_parts: - final_response = "".join(truncated_response_parts) + final_response - truncated_response_parts = [] - length_continue_retries = 0 - - final_response = self._strip_think_blocks(final_response).strip() - - final_msg = self._build_assistant_message(assistant_message, finish_reason) - - # Pop thinking-only prefill and empty-response retry - # scaffolding before appending the final response. These - # internal turns are only for the next API retry and should - # not become durable transcript context. - while ( - messages - and isinstance(messages[-1], dict) - and ( - messages[-1].get("_thinking_prefill") - or messages[-1].get("_empty_recovery_synthetic") - or messages[-1].get("_empty_terminal_sentinel") - ) - ): - messages.pop() - - messages.append(final_msg) - - _turn_exit_reason = f"text_response(finish_reason={finish_reason})" - if not self.quiet_mode: - self._safe_print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)") - break - - except Exception as e: - error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}" - try: - print(f"❌ {error_msg}") - except (OSError, ValueError): - logger.error(error_msg) - - logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True) - - # If an assistant message with tool_calls was already appended, - # the API expects a role="tool" result for every tool_call_id. - # Fill in error results for any that weren't answered yet. 
- for idx in range(len(messages) - 1, -1, -1): - msg = messages[idx] - if not isinstance(msg, dict): - break - if msg.get("role") == "tool": - continue - if msg.get("role") == "assistant" and msg.get("tool_calls"): - answered_ids = { - m["tool_call_id"] - for m in messages[idx + 1:] - if isinstance(m, dict) and m.get("role") == "tool" - } - for tc in msg["tool_calls"]: - if not tc or not isinstance(tc, dict): continue - if tc["id"] not in answered_ids: - err_msg = { - "role": "tool", - "name": AIAgent._get_tool_call_name_static(tc), - "tool_call_id": tc["id"], - "content": f"Error executing tool: {error_msg}", - } - messages.append(err_msg) - break - - # Non-tool errors don't need a synthetic message injected. - # The error is already printed to the user (line above), and - # the retry loop continues. Injecting a fake user/assistant - # message pollutes history, burns tokens, and risks violating - # role-alternation invariants. - - # If we're near the limit, break to avoid infinite loops - if api_call_count >= self.max_iterations - 1: - _turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})" - final_response = f"I apologize, but I encountered repeated errors: {error_msg}" - # Append as assistant so the history stays valid for - # session resume (avoids consecutive user messages). - messages.append({"role": "assistant", "content": final_response}) - break - - if final_response is None and ( - api_call_count >= self.max_iterations - or self.iteration_budget.remaining <= 0 - ): - # Budget exhausted — ask the model for a summary via one extra - # API call with tools stripped. _handle_max_iterations injects a - # user message and makes a single toolless request. 
- _turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})" - self._emit_status( - f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " - "— asking model to summarise" - ) - if not self.quiet_mode: - self._safe_print( - f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " - "— requesting summary..." - ) - final_response = self._handle_max_iterations(messages, api_call_count) - - # If running as a kanban worker, block the task so the dispatcher - # knows the worker could not complete (rather than treating it as a - # protocol violation). The agent loop strips tools before calling - # _handle_max_iterations, so the model cannot call kanban_block - # itself — we must do it on its behalf. - _kanban_task = os.environ.get("HERMES_KANBAN_TASK") - if _kanban_task: - try: - handle_function_call( - "kanban_block", - { - "task_id": _kanban_task, - "reason": ( - f"Iteration budget exhausted " - f"({api_call_count}/{self.max_iterations}) — " - "task could not complete within the allowed " - "iterations" - ), - }, - task_id=effective_task_id, - ) - logger.info( - "kanban_block called for task %s after iteration " - "exhaustion (%d/%d)", - _kanban_task, api_call_count, self.max_iterations, - ) - except Exception: - logger.warning( - "Failed to call kanban_block after iteration " - "exhaustion for task %s", - _kanban_task, - exc_info=True, - ) - - # Determine if conversation completed successfully - completed = final_response is not None and api_call_count < self.max_iterations - - # Save trajectory if enabled. ``user_message`` may be a multimodal - # list of parts; the trajectory format wants a plain string. 
- self._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) - - # Clean up VM and browser for this task after conversation completes - self._cleanup_task_resources(effective_task_id) - - # Persist session to both JSON log and SQLite only after private retry - # scaffolding has been removed. Otherwise a later user "continue" turn - # can replay assistant("(empty)") / recovery nudges and fall into the - # same empty-response loop again. - self._drop_trailing_empty_response_scaffolding(messages) - self._persist_session(messages, conversation_history) - - # ── Turn-exit diagnostic log ───────────────────────────────────── - # Always logged at INFO so agent.log captures WHY every turn ended. - # When the last message is a tool result (agent was mid-work), log - # at WARNING — this is the "just stops" scenario users report. - _last_msg_role = messages[-1].get("role") if messages else None - _last_tool_name = None - if _last_msg_role == "tool": - # Walk back to find the assistant message with the tool call - for _m in reversed(messages): - if _m.get("role") == "assistant" and _m.get("tool_calls"): - _tcs = _m["tool_calls"] - if _tcs and isinstance(_tcs[0], dict): - _last_tool_name = _tcs[-1].get("function", {}).get("name") - break - - _turn_tool_count = sum( - 1 for m in messages - if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls") - ) - _resp_len = len(final_response) if final_response else 0 - _budget_used = self.iteration_budget.used if self.iteration_budget else 0 - _budget_max = self.iteration_budget.max_total if self.iteration_budget else 0 - - _diag_msg = ( - "Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d " - "tool_turns=%d last_msg_role=%s response_len=%d session=%s" - ) - _diag_args = ( - _turn_exit_reason, self.model, api_call_count, self.max_iterations, - _budget_used, _budget_max, - _turn_tool_count, _last_msg_role, _resp_len, - self.session_id or "none", - ) - - if _last_msg_role == 
"tool" and not interrupted: - # Agent was mid-work — this is the "just stops" case. - logger.warning( - "Turn ended with pending tool result (agent may appear stuck). " - + _diag_msg + " last_tool=%s", - *_diag_args, _last_tool_name, - ) - else: - logger.info(_diag_msg, *_diag_args) - - # File-mutation verifier footer. - # If one or more ``write_file`` / ``patch`` calls failed during this - # turn and were never superseded by a successful write to the same - # path, append an advisory footer to the assistant response. This - # catches the specific case — reported by Ben Eng (#15524-adjacent) - # — where a model issues a batch of parallel patches, half of them - # fail with "Could not find old_string", and the model summarises - # the turn claiming every file was edited. The user then has to - # manually run ``git status`` to catch the lie. With this footer - # the truth is surfaced on every turn, so over-claiming is - # structurally impossible past the model. - # - # Gate: only applied when a real text response exists for this - # turn and the user didn't interrupt. Empty/interrupted turns - # already have other surface text that shouldn't be augmented. - if final_response and not interrupted: - try: - _failed = getattr(self, "_turn_failed_file_mutations", None) or {} - if _failed and self._file_mutation_verifier_enabled(): - footer = self._format_file_mutation_failure_footer(_failed) - if footer: - final_response = final_response.rstrip() + "\n\n" + footer - except Exception as _ver_err: - logger.debug("file-mutation verifier footer failed: %s", _ver_err) - - # Plugin hook: transform_llm_output - # Fired once per turn after the tool-calling loop completes. - # Plugins can transform the LLM's output text before it's returned. - # First hook to return a string wins; None/empty return leaves text unchanged. 
- if final_response and not interrupted: - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _transform_results = _invoke_hook( - "transform_llm_output", - response_text=final_response, - session_id=self.session_id or "", - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - for _hook_result in _transform_results: - if isinstance(_hook_result, str) and _hook_result: - final_response = _hook_result - break # First non-empty string wins - except Exception as exc: - logger.warning("transform_llm_output hook failed: %s", exc) - - # Plugin hook: post_llm_call - # Fired once per turn after the tool-calling loop completes. - # Plugins can use this to persist conversation data (e.g. sync - # to an external memory system). - if final_response and not interrupted: - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "post_llm_call", - session_id=self.session_id, - user_message=original_user_message, - assistant_response=final_response, - conversation_history=list(messages), - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("post_llm_call hook failed: %s", exc) - - # Extract reasoning from the CURRENT turn only. Walk backwards - # but stop at the user message that started this turn — anything - # earlier is from a prior turn and must not leak into the reasoning - # box (confusing stale display; #17055). Within the current turn - # we still want the *most recent* non-empty reasoning: many - # providers (Claude thinking, DeepSeek v4, Codex Responses) emit - # reasoning on the tool-call step and leave the final-answer step - # with reasoning=None, so picking only the last assistant would - # silently drop legitimate same-turn reasoning. 
- last_reasoning = None - for msg in reversed(messages): - if msg.get("role") == "user": - break # turn boundary — don't cross into prior turns - if msg.get("role") == "assistant" and msg.get("reasoning"): - last_reasoning = msg["reasoning"] - break - - # Build result with interrupt info if applicable - result = { - "final_response": final_response, - "last_reasoning": last_reasoning, - "messages": messages, - "api_calls": api_call_count, - "completed": completed, - "turn_exit_reason": _turn_exit_reason, - "partial": False, # True only when stopped due to invalid tool calls - "interrupted": interrupted, - "response_previewed": getattr(self, "_response_was_previewed", False), - "model": self.model, - "provider": self.provider, - "base_url": self.base_url, - "input_tokens": self.session_input_tokens, - "output_tokens": self.session_output_tokens, - "cache_read_tokens": self.session_cache_read_tokens, - "cache_write_tokens": self.session_cache_write_tokens, - "reasoning_tokens": self.session_reasoning_tokens, - "prompt_tokens": self.session_prompt_tokens, - "completion_tokens": self.session_completion_tokens, - "total_tokens": self.session_total_tokens, - "last_prompt_tokens": getattr(self.context_compressor, "last_prompt_tokens", 0) or 0, - "estimated_cost_usd": self.session_estimated_cost_usd, - "cost_status": self.session_cost_status, - "cost_source": self.session_cost_source, - } - if self._tool_guardrail_halt_decision is not None: - result["guardrail"] = self._tool_guardrail_halt_decision.to_metadata() - # If a /steer landed after the final assistant turn (no more tool - # batches to drain into), hand it back to the caller so it can be - # delivered as the next user turn instead of being silently lost. 
- _leftover_steer = self._drain_pending_steer() - if _leftover_steer: - result["pending_steer"] = _leftover_steer - self._response_was_previewed = False - - # Include interrupt message if one triggered the interrupt - if interrupted and self._interrupt_message: - result["interrupt_message"] = self._interrupt_message - - # Clear interrupt state after handling - self.clear_interrupt() - - # Clear stream callback so it doesn't leak into future calls - self._stream_callback = None - - # Check skill trigger NOW — based on how many tool iterations THIS turn used. - _should_review_skills = False - if (self._skill_nudge_interval > 0 - and self._iters_since_skill >= self._skill_nudge_interval - and "skill_manage" in self.valid_tool_names): - _should_review_skills = True - self._iters_since_skill = 0 - - # External memory provider: sync the completed turn + queue next prefetch. - self._sync_external_memory_for_turn( - original_user_message=original_user_message, - final_response=final_response, - interrupted=interrupted, - ) - - # Background memory/skill review — runs AFTER the response is delivered - # so it never competes with the user's task for model attention. - if final_response and not interrupted and (_should_review_memory or _should_review_skills): - try: - self._spawn_background_review( - messages_snapshot=list(messages), - review_memory=_should_review_memory, - review_skills=_should_review_skills, - ) - except Exception: - pass # Background review is best-effort - - # Note: Memory provider on_session_end() + shutdown_all() are NOT - # called here — run_conversation() is called once per user message in - # multi-turn sessions. Shutting down after every turn would kill the - # provider before the second message. Actual session-end cleanup is - # handled by the CLI (atexit / /reset) and gateway (session expiry / - # _reset_session). - - # Plugin hook: on_session_end - # Fired at the very end of every run_conversation call. 
- # Plugins can use this for cleanup, flushing buffers, etc. - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "on_session_end", - session_id=self.session_id, - completed=completed, - interrupted=interrupted, - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("on_session_end hook failed: %s", exc) - - return result + """Forwarder — see ``agent.conversation_loop.run_conversation``.""" + from agent.conversation_loop import run_conversation + return run_conversation(self, user_message, system_message, conversation_history, task_id, stream_callback, persist_user_message) def chat(self, message: str, stream_callback: Optional[callable] = None) -> str: """ @@ -16103,144 +3871,9 @@ class AIAgent: effective_task_id: str, should_review_memory: bool = False, ) -> Dict[str, Any]: - """Codex app-server runtime path. Hands the entire turn to a `codex - app-server` subprocess and projects its events back into Hermes' - messages list so memory/skill review keep working. - - Called from run_conversation() when self.api_mode == "codex_app_server". - Returns the same dict shape as the chat_completions path. - """ - from agent.transports.codex_app_server_session import CodexAppServerSession - - # Lazy session: one CodexAppServerSession per AIAgent instance. - # Spawned on first turn, reused across turns, closed at AIAgent - # shutdown (see _cleanup hook). - if not hasattr(self, "_codex_session") or self._codex_session is None: - cwd = getattr(self, "session_cwd", None) or os.getcwd() - # Approval callback: defer to Hermes' standard prompt flow if a - # CLI thread has installed one. Gateway / cron contexts get the - # codex-side fail-closed default. 
- try: - from tools.terminal_tool import _get_approval_callback - approval_callback = _get_approval_callback() - except Exception: - approval_callback = None - self._codex_session = CodexAppServerSession( - cwd=cwd, - approval_callback=approval_callback, - ) - - # NOTE: the user message is ALREADY appended to messages by the - # standard run_conversation() flow (line ~11823) before the early - # return reaches us. Do NOT append again — that would duplicate. - - try: - turn = self._codex_session.run_turn(user_input=user_message) - except Exception as exc: - logger.exception("codex app-server turn failed") - # Crash → unconditionally drop the session so the next turn - # respawns from scratch instead of reusing a dead client. - try: - self._codex_session.close() - except Exception: - pass - self._codex_session = None - return { - "final_response": ( - f"Codex app-server turn failed: {exc}. " - f"Fall back to default runtime with `/codex-runtime auto`." - ), - "messages": messages, - "api_calls": 0, - "completed": False, - "partial": True, - "error": str(exc), - } - - # If the turn signalled the underlying client is wedged (deadline - # blown, post-tool watchdog tripped, OAuth refresh died, subprocess - # exited), retire the session so the next turn respawns codex - # rather than riding the broken process. Mirrors openclaw beta.8's - # "retire timed-out app-server clients" fix. - if getattr(turn, "should_retire", False): - logger.warning( - "codex app-server session retired (turn error: %s)", - turn.error, - ) - try: - self._codex_session.close() - except Exception: - pass - self._codex_session = None - - # Splice projected messages into the conversation. The projector emits - # standard {role, content, tool_calls, tool_call_id} entries, which - # is exactly what curator.py / sessions DB expect. - if turn.projected_messages: - messages.extend(turn.projected_messages) - - # Counter ticks for the self-improvement loop. 
- # _turns_since_memory and _user_turn_count are ALREADY incremented - # in the run_conversation() pre-loop block (lines ~11793-11817) so we - # do NOT touch them here — that would double-count. - # Only _iters_since_skill needs explicit increment, since the - # chat_completions loop bumps it per tool iteration (line ~12110) - # and that loop is bypassed on this path. - self._iters_since_skill = ( - getattr(self, "_iters_since_skill", 0) + turn.tool_iterations - ) - - # Now check the skill nudge AFTER iters were incremented — same - # pattern the chat_completions path uses (line ~15432). - should_review_skills = False - if ( - self._skill_nudge_interval > 0 - and self._iters_since_skill >= self._skill_nudge_interval - and "skill_manage" in self.valid_tool_names - ): - should_review_skills = True - self._iters_since_skill = 0 - - # External memory provider sync (mirrors line ~15439). Skipped on - # interrupt/error to avoid feeding partial transcripts to memory. - if not turn.interrupted and turn.error is None: - try: - self._sync_external_memory_for_turn( - original_user_message=original_user_message, - final_response=turn.final_text, - interrupted=False, - ) - except Exception: - logger.debug("external memory sync raised", exc_info=True) - - # Background review fork — same cadence + signature as the default - # path (line ~15449). Only fires when a trigger actually tripped AND - # we have a real final response. 
- if ( - turn.final_text - and not turn.interrupted - and (should_review_memory or should_review_skills) - ): - try: - self._spawn_background_review( - messages_snapshot=list(messages), - review_memory=should_review_memory, - review_skills=should_review_skills, - ) - except Exception: - logger.debug("background review spawn raised", exc_info=True) - - return { - "final_response": turn.final_text, - "messages": messages, - "api_calls": 1, # one app-server "turn" maps to one logical API call - "completed": not turn.interrupted and turn.error is None, - "partial": turn.interrupted or turn.error is not None, - "error": turn.error, - "codex_thread_id": turn.thread_id, - "codex_turn_id": turn.turn_id, - } - + """Forwarder — see ``agent.codex_runtime.run_codex_app_server_turn``.""" + from agent.codex_runtime import run_codex_app_server_turn + return run_codex_app_server_turn(self, user_message=user_message, original_user_message=original_user_message, messages=messages, effective_task_id=effective_task_id, should_review_memory=should_review_memory) def main( query: str = None, diff --git a/tests/run_agent/test_jsondecodeerror_retryable.py b/tests/run_agent/test_jsondecodeerror_retryable.py index 201521ddb22..0bd4fc09f9f 100644 --- a/tests/run_agent/test_jsondecodeerror_retryable.py +++ b/tests/run_agent/test_jsondecodeerror_retryable.py @@ -73,15 +73,20 @@ class TestAgentLoopSourceStillHasCarveOut: revert that happens to leave the test file intact.""" def test_run_agent_excludes_jsondecodeerror_from_local_validation(self): - import run_agent import inspect - src = inspect.getsource(run_agent) + from agent import conversation_loop + # The agent loop body lives in agent/conversation_loop.py after + # the run_agent.py refactor. Assert the carve-out is present in + # the extracted module specifically — if it ever moves back or + # disappears, this fails loudly rather than silently passing + # against a non-existent inline replica. 
+ src = inspect.getsource(conversation_loop) # The predicate we care about must reference json.JSONDecodeError # in its exclusion tuple. We check for the specific co-occurrence # rather than the literal string so harmless reformatting doesn't # break us. assert "is_local_validation_error" in src assert "JSONDecodeError" in src, ( - "run_agent.py must carve out json.JSONDecodeError from the " - "is_local_validation_error classification — see #14782." + "agent/conversation_loop.py must carve out json.JSONDecodeError " + "from the is_local_validation_error classification — see #14782." ) diff --git a/tests/run_agent/test_memory_nudge_counter_hydration.py b/tests/run_agent/test_memory_nudge_counter_hydration.py index abf97d265a6..1b9bf56005d 100644 --- a/tests/run_agent/test_memory_nudge_counter_hydration.py +++ b/tests/run_agent/test_memory_nudge_counter_hydration.py @@ -120,10 +120,22 @@ def test_production_code_contains_hydration_block(): """Smoke test: confirm the hydration code is actually wired into run_conversation(). If someone deletes it, tests above still pass against the inline replica — this fails them awake. + + After the run_agent.py refactor the agent-loop body lives in + ``agent/conversation_loop.py`` and uses ``agent.X`` rather than + ``self.X``. Assert the block is present in the extracted module + specifically — if it ever drifts back into run_agent.py or + disappears entirely, this guard fails loudly. """ from pathlib import Path - src = Path(__file__).resolve().parents[2] / "run_agent.py" - content = src.read_text(encoding="utf-8") + repo = Path(__file__).resolve().parents[2] + cl_path = repo / "agent" / "conversation_loop.py" + src_cl = cl_path.read_text(encoding="utf-8") # Anchor on the unique comment + the modulo line. 
- assert "Hydrate per-session nudge counters from persisted history" in content - assert "self._turns_since_memory = prior_user_turns % self._memory_nudge_interval" in content + assert "Hydrate per-session nudge counters from persisted history" in src_cl, ( + f"Hydration comment missing from {cl_path}" + ) + assert ( + "agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval" + in src_cl + ), f"Hydration modulo assignment missing from {cl_path}" diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 8d56ff6425a..11b58e5faa1 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -4879,23 +4879,26 @@ class TestAnthropicInterruptHandler: def test_interruptible_has_anthropic_branch(self): """The interrupt handler must check api_mode == 'anthropic_messages'.""" import inspect - source = inspect.getsource(AIAgent._interruptible_api_call) + from agent.chat_completion_helpers import interruptible_api_call + source = inspect.getsource(interruptible_api_call) assert "anthropic_messages" in source, \ - "_interruptible_api_call must handle Anthropic interrupt (api_mode check)" + "interruptible_api_call must handle Anthropic interrupt (api_mode check)" def test_interruptible_rebuilds_anthropic_client(self): """After interrupting, the Anthropic client should be rebuilt.""" import inspect - source = inspect.getsource(AIAgent._interruptible_api_call) + from agent.chat_completion_helpers import interruptible_api_call + source = inspect.getsource(interruptible_api_call) assert "build_anthropic_client" in source, \ - "_interruptible_api_call must rebuild Anthropic client after interrupt" + "interruptible_api_call must rebuild Anthropic client after interrupt" def test_streaming_has_anthropic_branch(self): """_streaming_api_call must also handle Anthropic interrupt.""" import inspect - source = inspect.getsource(AIAgent._interruptible_streaming_api_call) + from agent.chat_completion_helpers import 
interruptible_streaming_api_call + source = inspect.getsource(interruptible_streaming_api_call) assert "anthropic_messages" in source, \ - "_streaming_api_call must handle Anthropic interrupt" + "interruptible_streaming_api_call must handle Anthropic interrupt" # --------------------------------------------------------------------------- @@ -5304,14 +5307,20 @@ class TestMemoryNudgeCounterPersistence: def test_counters_not_reset_in_preamble(self): """The run_conversation preamble must not zero the nudge counters.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) # The preamble resets many fields (retry counts, budget, etc.) # before the main loop. Find that reset block and verify our # counters aren't in it. The reset block ends at iteration_budget. - preamble_end = src.index("self.iteration_budget = IterationBudget") + # The extracted body uses ``agent.X`` (not ``self.X``). Anchor + # exactly on ``agent.iteration_budget = IterationBudget`` so an + # unrelated identifier ending in ``iteration_budget`` (e.g. + # ``_iteration_budget`` or ``shared_iteration_budget``) can't + # match the boundary. 
+ preamble_end = src.index("agent.iteration_budget = IterationBudget") preamble = src[:preamble_end] - assert "self._turns_since_memory = 0" not in preamble - assert "self._iters_since_skill = 0" not in preamble + assert "agent._turns_since_memory = 0" not in preamble + assert "agent._iters_since_skill = 0" not in preamble class TestDeadRetryCode: @@ -5319,7 +5328,8 @@ class TestDeadRetryCode: def test_no_unreachable_max_retries_after_backoff(self): import inspect - source = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + source = inspect.getsource(_rc) occurrences = source.count("if retry_count >= max_retries:") assert occurrences == 2, ( f"Expected 2 occurrences of 'if retry_count >= max_retries:' " @@ -5357,7 +5367,8 @@ class TestMemoryContextSanitization: a literal tag we don't silently delete their text. The streaming scrubber + plugin-side scrub cover real leak paths.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) assert "sanitize_context(user_message)" not in src assert "sanitize_context(persist_user_message)" not in src @@ -5393,7 +5404,8 @@ class TestMemoryProviderTurnStart: def test_on_turn_start_called_before_prefetch(self): """Source-level check: on_turn_start appears before prefetch_all in run_conversation.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) # Find the actual method calls, not comments idx_turn_start = src.index(".on_turn_start(") idx_prefetch = src.index(".prefetch_all(") @@ -5403,7 +5415,10 @@ class TestMemoryProviderTurnStart: ) def test_on_turn_start_uses_user_turn_count(self): - """Source-level check: on_turn_start receives self._user_turn_count.""" + """Source-level check: on_turn_start receives the user_turn_count.""" import inspect - src = 
inspect.getsource(AIAgent.run_conversation) - assert "on_turn_start(self._user_turn_count" in src + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) + # The extracted body uses ``agent.X`` rather than ``self.X``; + # assert the extracted-form spelling directly. + assert "on_turn_start(agent._user_turn_count" in src diff --git a/tests/run_agent/test_tool_executor_contextvar_propagation.py b/tests/run_agent/test_tool_executor_contextvar_propagation.py index 652ecf05def..2e1d543705a 100644 --- a/tests/run_agent/test_tool_executor_contextvar_propagation.py +++ b/tests/run_agent/test_tool_executor_contextvar_propagation.py @@ -152,19 +152,28 @@ def test_run_agent_concurrent_executor_wraps_submit_with_copy_context(): import inspect import run_agent + from agent import tool_executor as tool_executor_module - src_path = inspect.getsourcefile(run_agent) - assert src_path is not None - tree = ast.parse(open(src_path, encoding="utf-8").read()) + # Source for both modules — the concurrent-executor body lives in + # ``agent/tool_executor.py`` after the run_agent.py refactor (PR + # following #16660). Search both so this guard keeps firing + # regardless of where the call site lives. + sources = [] + for mod in (run_agent, tool_executor_module): + src_path = inspect.getsourcefile(mod) + assert src_path is not None + sources.append((src_path, open(src_path, encoding="utf-8").read())) submit_calls_in_agent: list[ast.Call] = [] - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - func = node.func - # Match executor.submit(...) style calls. - if isinstance(func, ast.Attribute) and func.attr == "submit": - submit_calls_in_agent.append(node) + for _src_path, src_text in sources: + tree = ast.parse(src_text) + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + # Match executor.submit(...) style calls. 
+ if isinstance(func, ast.Attribute) and func.attr == "submit": + submit_calls_in_agent.append(node) # Filter to the submit call inside the concurrent tool executor — # identifiable by passing `_run_tool` as its target. Other submit()