hermes-agent/run_agent.py
teknium1 d35ee7bcdd
refactor(run_agent): move review prompts to agent/background_review.py
The three big review-prompt strings (_MEMORY_REVIEW_PROMPT,
_SKILL_REVIEW_PROMPT, _COMBINED_REVIEW_PROMPT — 183 lines combined) move
out of the AIAgent class body and into agent/background_review.py where
they're consumed.

AIAgent re-exposes them as class attributes via 'from ... import' inside
the class body — Python binds those names into the class namespace so
existing AIAgent._MEMORY_REVIEW_PROMPT references keep working.
spawn_background_review_thread also falls back to the module-level
constants if an agent doesn't have the attribute (preserves the test
pattern of mocking these on the agent).

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure).

run_agent.py: 9986 -> 9800 lines (-186).
2026-05-16 19:11:58 -07:00

9808 lines
495 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
AI Agent Runner with Tool Calling
This module provides a clean, standalone agent that can execute AI models
with tool calling capabilities. It handles the conversation loop, tool execution,
and response management.
Features:
- Automatic tool calling loop until completion
- Configurable model parameters
- Error handling and recovery
- Message history management
- Support for multiple model providers
Usage:
from run_agent import AIAgent
agent = AIAgent(base_url="http://localhost:30000/v1", model="claude-opus-4-20250514")
response = agent.run_conversation("Tell me about the latest Python updates")
"""
# IMPORTANT: hermes_bootstrap must be the very first import — UTF-8 stdio
# on Windows. No-op on POSIX. See hermes_bootstrap.py for full rationale.
try:
import hermes_bootstrap # noqa: F401
except ModuleNotFoundError:
# Graceful fallback when hermes_bootstrap isn't registered in the venv
# yet — happens during partial ``hermes update`` where git-reset landed
# new code but ``uv pip install -e .`` didn't finish. Missing bootstrap
# means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
pass
import asyncio
import base64
import concurrent.futures
import contextvars
import copy
import hashlib
import json
import logging
logger = logging.getLogger(__name__)
import os
import random
import re
import ssl
import sys
import tempfile
import time
import threading
from types import SimpleNamespace
import urllib.request
import uuid
from typing import List, Dict, Any, Optional
from urllib.parse import urlparse, parse_qs, urlunparse
# NOTE: `from openai import OpenAI` is deliberately NOT at module top — the
# SDK pulls ~240 ms of imports. We expose `OpenAI` as a thin proxy object
# that imports the SDK on first call/isinstance check. This preserves:
# (a) the single in-module `OpenAI(**client_kwargs)` call site at
# _create_openai_client, and
# (b) `patch("run_agent.OpenAI", ...)` test patterns used by ~28 test files.
#
# NOTE: `fire` is ONLY used in the `__main__` block below (for running
# run_agent.py directly as a CLI) — it is NOT needed for library usage.
# It is imported there, not here, so that importing run_agent from a
# daemon thread (e.g. curator's forked review agent) never fails with
# ModuleNotFoundError on broken/partial installs where `fire` isn't present.
from datetime import datetime
from pathlib import Path
from hermes_constants import get_hermes_home
# OpenAI lazy proxy + safe stdio + proxy URL helpers — see agent/process_bootstrap.py.
# `OpenAI` is re-exported here so `patch("run_agent.OpenAI", ...)` in tests works.
from agent.process_bootstrap import (
OpenAI,
_OpenAIProxy,
_load_openai_cls,
_SafeWriter,
_install_safe_stdio,
_get_proxy_from_env,
_get_proxy_for_base_url,
)
from agent.iteration_budget import IterationBudget
from hermes_cli.env_loader import load_hermes_dotenv
from hermes_cli.timeouts import (
get_provider_request_timeout,
get_provider_stale_timeout,
)
_hermes_home = get_hermes_home()
_project_env = Path(__file__).parent / '.env'
_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
if _loaded_env_paths:
for _env_path in _loaded_env_paths:
logger.info("Loaded environment variables from %s", _env_path)
else:
logger.info("No .env file found. Using system environment variables.")
# Import our tool system
from model_tools import (
get_tool_definitions,
get_toolset_for_tool,
handle_function_call,
check_toolset_requirements,
)
from tools.terminal_tool import cleanup_vm, get_active_env, is_persistent_env
from tools.terminal_tool import (
set_approval_callback as _set_approval_callback,
set_sudo_password_callback as _set_sudo_password_callback,
_get_approval_callback,
_get_sudo_password_callback,
)
from tools.tool_result_storage import maybe_persist_tool_result, enforce_turn_budget
from tools.interrupt import set_interrupt as _set_interrupt
from tools.browser_tool import cleanup_browser
# Agent internals extracted to agent/ package for modularity
from agent.memory_manager import StreamingContextScrubber, build_memory_context_block, sanitize_context
from agent.think_scrubber import StreamingThinkScrubber
from agent.retry_utils import jittered_backoff
from agent.error_classifier import classify_api_error, FailoverReason
from agent.prompt_builder import (
DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
HERMES_AGENT_HELP_GUIDANCE,
KANBAN_GUIDANCE,
build_nous_subscription_prompt,
)
from agent.model_metadata import (
fetch_model_metadata,
estimate_tokens_rough, estimate_messages_tokens_rough, estimate_request_tokens_rough,
get_next_probe_tier, parse_context_limit_from_error,
parse_available_output_tokens_from_error,
save_context_length, is_local_endpoint,
query_ollama_num_ctx,
)
from agent.context_compressor import ContextCompressor
from agent.subdirectory_hints import SubdirectoryHintTracker
from agent.prompt_caching import apply_anthropic_cache_control
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt, build_environment_hints, load_soul_md, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, GOOGLE_MODEL_OPERATIONAL_GUIDANCE, OPENAI_MODEL_EXECUTION_GUIDANCE
from agent.usage_pricing import estimate_usage_cost, normalize_usage
from agent.codex_responses_adapter import (
_derive_responses_function_call_id as _codex_derive_responses_function_call_id,
_deterministic_call_id as _codex_deterministic_call_id,
_split_responses_tool_id as _codex_split_responses_tool_id,
_summarize_user_message_for_log,
)
from agent.display import (
KawaiiSpinner, build_tool_preview as _build_tool_preview,
get_cute_tool_message as _get_cute_tool_message_impl,
_detect_tool_failure,
get_tool_emoji as _get_tool_emoji,
)
from agent.tool_guardrails import (
ToolCallGuardrailConfig,
ToolCallGuardrailController,
ToolGuardrailDecision,
append_toolguard_guidance,
toolguard_synthetic_result,
)
from agent.tool_result_classification import (
FILE_MUTATING_TOOL_NAMES as _FILE_MUTATING_TOOLS,
file_mutation_result_landed,
)
from agent.trajectory import (
convert_scratchpad_to_think, has_incomplete_scratchpad,
save_trajectory as _save_trajectory_to_file,
)
from agent.message_sanitization import (
_SURROGATE_RE,
_sanitize_surrogates,
_sanitize_structure_surrogates,
_sanitize_messages_surrogates,
_escape_invalid_chars_in_json_strings,
_repair_tool_call_arguments,
_strip_non_ascii,
_sanitize_messages_non_ascii,
_sanitize_tools_non_ascii,
_strip_images_from_messages,
_sanitize_structure_non_ascii,
)
from agent.tool_dispatch_helpers import (
_NEVER_PARALLEL_TOOLS,
_PARALLEL_SAFE_TOOLS,
_PATH_SCOPED_TOOLS,
_DESTRUCTIVE_PATTERNS,
_REDIRECT_OVERWRITE,
_is_destructive_command,
_should_parallelize_tool_batch,
_extract_parallel_scope_path,
_paths_overlap,
_is_multimodal_tool_result,
_multimodal_text_summary,
_append_subdir_hint_to_multimodal,
_extract_file_mutation_targets,
_extract_error_preview,
_trajectory_normalize_msg,
)
from utils import atomic_json_write, base_url_host_matches, base_url_hostname, env_var_enabled, normalize_proxy_url
from hermes_cli.config import cfg_get
_MAX_TOOL_WORKERS = 8
# Guard so the OpenRouter metadata pre-warm thread is only spawned once per
# process, not once per AIAgent instantiation. Without this, long-running
# gateway processes leak one OS thread per incoming message and eventually
# exhaust the system thread limit (RuntimeError: can't start new thread).
_openrouter_prewarm_done = threading.Event()
# =========================================================================
# Large tool result handler — save oversized output to temp file
# =========================================================================
# =========================================================================
# Qwen Portal headers — mimics QwenCode CLI for portal.qwen.ai compatibility.
# Extracted as a module-level helper so both __init__ and
# _apply_client_headers_for_base_url can share it.
# =========================================================================
_QWEN_CODE_VERSION = "0.14.1"
def _routermint_headers() -> dict:
"""Return the User-Agent RouterMint needs to avoid Cloudflare 1010 blocks."""
from hermes_cli import __version__ as _HERMES_VERSION
return {
"User-Agent": f"HermesAgent/{_HERMES_VERSION}",
}
def _pool_may_recover_from_rate_limit(
pool, *, provider: str | None = None, base_url: str | None = None
) -> bool:
"""Decide whether to wait for credential-pool rotation instead of falling back.
The existing pool-rotation path requires the pool to (1) exist and (2) have
at least one entry not currently in exhaustion cooldown. But rotation is
only meaningful when the pool has more than one entry.
With a single-credential pool (common for Gemini OAuth, Vertex service
accounts, and any "one personal key" configuration), the primary entry
just 429'd and there is nothing to rotate to. Waiting for the pool
cooldown to expire means retrying against the same exhausted quota — the
daily-quota 429 will recur immediately, and the retry budget is burned.
Additionally, Google CloudCode / Gemini CLI rate limits are ACCOUNT-level
throttles — even a multi-entry pool shares the same quota window, so
rotation won't recover. Skip straight to the fallback for those (#13636).
In those cases we must fall back to the configured ``fallback_model``
instead. Returns True only when rotation has somewhere to go.
See issues #11314 and #13636.
"""
if pool is None:
return False
if not pool.has_available():
return False
# CloudCode / Gemini CLI quotas are account-wide — all pool entries share
# the same throttle window, so rotation can't recover. Prefer fallback.
if provider == "google-gemini-cli" or str(base_url or "").startswith("cloudcode-pa://"):
return False
return len(pool.entries()) > 1
def _qwen_portal_headers() -> dict:
"""Return default HTTP headers required by Qwen Portal API."""
import platform as _plat
_ua = f"QwenCode/{_QWEN_CODE_VERSION} ({_plat.system().lower()}; {_plat.machine()})"
return {
"User-Agent": _ua,
"X-DashScope-CacheControl": "enable",
"X-DashScope-UserAgent": _ua,
"X-DashScope-AuthType": "qwen-oauth",
}
class AIAgent:
"""
AI Agent with tool calling capabilities.
This class manages the conversation flow, tool execution, and response handling
for AI models that support function calling.
"""
_TOOL_CALL_ARGUMENTS_CORRUPTION_MARKER = (
"[hermes-agent: tool call arguments were corrupted in this session and "
"have been dropped to keep the conversation alive. See issue #15236.]"
)
@property
def base_url(self) -> str:
return self._base_url
@base_url.setter
def base_url(self, value: str) -> None:
self._base_url = value
self._base_url_lower = value.lower() if value else ""
self._base_url_hostname = base_url_hostname(value)
def __init__(
self,
base_url: str = None,
api_key: str = None,
provider: str = None,
api_mode: str = None,
acp_command: str = None,
acp_args: list[str] | None = None,
command: str = None,
args: list[str] | None = None,
model: str = "",
max_iterations: int = 90, # Default tool-calling iterations (shared with subagents)
tool_delay: float = 1.0,
enabled_toolsets: List[str] = None,
disabled_toolsets: List[str] = None,
save_trajectories: bool = False,
verbose_logging: bool = False,
quiet_mode: bool = False,
ephemeral_system_prompt: str = None,
log_prefix_chars: int = 100,
log_prefix: str = "",
providers_allowed: List[str] = None,
providers_ignored: List[str] = None,
providers_order: List[str] = None,
provider_sort: str = None,
provider_require_parameters: bool = False,
provider_data_collection: str = None,
openrouter_min_coding_score: Optional[float] = None,
session_id: str = None,
tool_progress_callback: callable = None,
tool_start_callback: callable = None,
tool_complete_callback: callable = None,
thinking_callback: callable = None,
reasoning_callback: callable = None,
clarify_callback: callable = None,
step_callback: callable = None,
stream_delta_callback: callable = None,
interim_assistant_callback: callable = None,
tool_gen_callback: callable = None,
status_callback: callable = None,
max_tokens: int = None,
reasoning_config: Dict[str, Any] = None,
service_tier: str = None,
request_overrides: Dict[str, Any] = None,
prefill_messages: List[Dict[str, Any]] = None,
platform: str = None,
user_id: str = None,
user_name: str = None,
chat_id: str = None,
chat_name: str = None,
chat_type: str = None,
thread_id: str = None,
gateway_session_key: str = None,
skip_context_files: bool = False,
load_soul_identity: bool = False,
skip_memory: bool = False,
session_db=None,
parent_session_id: str = None,
iteration_budget: "IterationBudget" = None,
fallback_model: Dict[str, Any] = None,
credential_pool=None,
checkpoints_enabled: bool = False,
checkpoint_max_snapshots: int = 20,
checkpoint_max_total_size_mb: int = 500,
checkpoint_max_file_size_mb: int = 10,
pass_session_id: bool = False,
):
"""
Initialize the AI Agent.
Args:
base_url (str): Base URL for the model API (optional)
api_key (str): API key for authentication (optional, uses env var if not provided)
provider (str): Provider identifier (optional; used for telemetry/routing hints)
api_mode (str): API mode override: "chat_completions" or "codex_responses"
model (str): Model name to use (default: "anthropic/claude-opus-4.6")
max_iterations (int): Maximum number of tool calling iterations (default: 90)
tool_delay (float): Delay between tool calls in seconds (default: 1.0)
enabled_toolsets (List[str]): Only enable tools from these toolsets (optional)
disabled_toolsets (List[str]): Disable tools from these toolsets (optional)
save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False)
verbose_logging (bool): Enable verbose logging for debugging (default: False)
quiet_mode (bool): Suppress progress output for clean CLI experience (default: False)
ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100)
log_prefix (str): Prefix to add to all log messages for identification in parallel processing (default: "")
providers_allowed (List[str]): OpenRouter providers to allow (optional)
providers_ignored (List[str]): OpenRouter providers to ignore (optional)
providers_order (List[str]): OpenRouter providers to try in order (optional)
provider_sort (str): Sort providers by price/throughput/latency (optional)
openrouter_min_coding_score (float): Coding-score floor (0.0-1.0) for the
openrouter/pareto-code router. Only applied when model == "openrouter/pareto-code".
None or empty = let OpenRouter pick the strongest available coder.
session_id (str): Pre-generated session ID for logging (optional, auto-generated if not provided)
tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications
clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions.
Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
Useful for injecting a few-shot example or priming the model's response style.
Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an
assistant-role message (400 error). For those models use structured outputs or
output_config.format instead of a trailing-assistant prefill.
platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
Used to inject platform-specific formatting hints into the system prompt.
skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
into the system prompt. Use this for batch processing and data generation to avoid
polluting trajectories with user-specific persona or project instructions.
load_soul_identity (bool): If True, still use ~/.hermes/SOUL.md as the primary
identity even when skip_context_files=True. Project context files from the cwd
remain skipped.
"""
_install_safe_stdio()
self.model = model
self.max_iterations = max_iterations
# Shared iteration budget — parent creates, children inherit.
# Consumed by every LLM turn across parent + all subagents.
self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
self.tool_delay = tool_delay
self.save_trajectories = save_trajectories
self.verbose_logging = verbose_logging
self.quiet_mode = quiet_mode
self.ephemeral_system_prompt = ephemeral_system_prompt
self.platform = platform # "cli", "telegram", "discord", "whatsapp", etc.
self._user_id = user_id # Platform user identifier (gateway sessions)
self._user_name = user_name
self._chat_id = chat_id
self._chat_name = chat_name
self._chat_type = chat_type
self._thread_id = thread_id
self._gateway_session_key = gateway_session_key # Stable per-chat key (e.g. agent:main:telegram:dm:123)
# Pluggable print function — CLI replaces this with _cprint so that
# raw ANSI status lines are routed through prompt_toolkit's renderer
# instead of going directly to stdout where patch_stdout's StdoutProxy
# would mangle the escape sequences. None = use builtins.print.
self._print_fn = None
self.background_review_callback = None # Optional sync callback for gateway delivery
self.skip_context_files = skip_context_files
self.load_soul_identity = load_soul_identity
self.pass_session_id = pass_session_id
self._credential_pool = credential_pool
self.log_prefix_chars = log_prefix_chars
self.log_prefix = f"{log_prefix} " if log_prefix else ""
# Store effective base URL for feature detection (prompt caching, reasoning, etc.)
self.base_url = base_url or ""
provider_name = provider.strip().lower() if isinstance(provider, str) and provider.strip() else None
self.provider = provider_name or ""
self.acp_command = acp_command or command
self.acp_args = list(acp_args or args or [])
if api_mode in {"chat_completions", "codex_responses", "anthropic_messages", "bedrock_converse", "codex_app_server"}:
self.api_mode = api_mode
elif self.provider == "openai-codex":
self.api_mode = "codex_responses"
elif self.provider == "xai":
self.api_mode = "codex_responses"
elif (provider_name is None) and (
self._base_url_hostname == "chatgpt.com"
and "/backend-api/codex" in self._base_url_lower
):
self.api_mode = "codex_responses"
self.provider = "openai-codex"
elif (provider_name is None) and self._base_url_hostname == "api.x.ai":
self.api_mode = "codex_responses"
self.provider = "xai"
elif self.provider == "anthropic" or (provider_name is None and self._base_url_hostname == "api.anthropic.com"):
self.api_mode = "anthropic_messages"
self.provider = "anthropic"
elif self._base_url_lower.rstrip("/").endswith("/anthropic"):
# Third-party Anthropic-compatible endpoints (e.g. MiniMax, DashScope)
# use a URL convention ending in /anthropic. Auto-detect these so the
# Anthropic Messages API adapter is used instead of chat completions.
self.api_mode = "anthropic_messages"
elif self.provider == "bedrock" or (
self._base_url_hostname.startswith("bedrock-runtime.")
and base_url_host_matches(self._base_url_lower, "amazonaws.com")
):
# AWS Bedrock — auto-detect from provider name or base URL
# (bedrock-runtime.<region>.amazonaws.com).
self.api_mode = "bedrock_converse"
else:
self.api_mode = "chat_completions"
# Eagerly warm the transport cache so import errors surface at init,
# not mid-conversation. Also validates the api_mode is registered.
try:
self._get_transport()
except Exception:
pass # Non-fatal — transport may not exist for all modes yet
try:
from hermes_cli.model_normalize import (
_AGGREGATOR_PROVIDERS,
normalize_model_for_provider,
)
if self.provider not in _AGGREGATOR_PROVIDERS:
self.model = normalize_model_for_provider(self.model, self.provider)
except Exception:
pass
# GPT-5.x models usually require the Responses API path, but some
# providers have exceptions (for example Copilot's gpt-5-mini still
# uses chat completions). Also auto-upgrade for direct OpenAI URLs
# (api.openai.com) since all newer tool-calling models prefer
# Responses there. ACP runtimes are excluded: CopilotACPClient
# handles its own routing and does not implement the Responses API
# surface.
# When api_mode was explicitly provided, respect it — the user
# knows what their endpoint supports (#10473).
# Exception: Azure OpenAI serves gpt-5.x on /chat/completions and
# does NOT support the Responses API — skip the upgrade for Azure
# (openai.azure.com), even though it looks OpenAI-compatible.
if (
api_mode is None
and self.api_mode == "chat_completions"
and self.provider != "copilot-acp"
and not str(self.base_url or "").lower().startswith("acp://copilot")
and not str(self.base_url or "").lower().startswith("acp+tcp://")
and not self._is_azure_openai_url()
and (
self._is_direct_openai_url()
or self._provider_model_requires_responses_api(
self.model,
provider=self.provider,
)
)
):
self.api_mode = "codex_responses"
# Invalidate the eager-warmed transport cache — api_mode changed
# from chat_completions to codex_responses after the warm at __init__.
if hasattr(self, "_transport_cache"):
self._transport_cache.clear()
# Pre-warm OpenRouter model metadata cache in a background thread.
# fetch_model_metadata() is cached for 1 hour; this avoids a blocking
# HTTP request on the first API response when pricing is estimated.
# Use a process-level Event so this thread is only spawned once — a new
# AIAgent is created for every gateway request, so without the guard
# each message leaks one OS thread and the process eventually exhausts
# the system thread limit (RuntimeError: can't start new thread).
if (self.provider == "openrouter" or self._is_openrouter_url()) and \
not _openrouter_prewarm_done.is_set():
_openrouter_prewarm_done.set()
threading.Thread(
target=fetch_model_metadata,
daemon=True,
name="openrouter-prewarm",
).start()
self.tool_progress_callback = tool_progress_callback
self.tool_start_callback = tool_start_callback
self.tool_complete_callback = tool_complete_callback
self.suppress_status_output = False
self.thinking_callback = thinking_callback
self.reasoning_callback = reasoning_callback
self.clarify_callback = clarify_callback
self.step_callback = step_callback
self.stream_delta_callback = stream_delta_callback
self.interim_assistant_callback = interim_assistant_callback
self.status_callback = status_callback
self.tool_gen_callback = tool_gen_callback
# Tool execution state — allows _vprint during tool execution
# even when stream consumers are registered (no tokens streaming then)
self._executing_tools = False
self._tool_guardrails = ToolCallGuardrailController()
self._tool_guardrail_halt_decision: ToolGuardrailDecision | None = None
# Interrupt mechanism for breaking out of tool loops
self._interrupt_requested = False
self._interrupt_message = None # Optional message that triggered interrupt
self._execution_thread_id: int | None = None # Set at run_conversation() start
self._interrupt_thread_signal_pending = False
self._client_lock = threading.RLock()
# /steer mechanism — inject a user note into the next tool result
# without interrupting the agent. Unlike interrupt(), steer() does
# NOT set _interrupt_requested; it waits for the current tool batch
# to finish naturally, then the drain hook appends the text to the
# last tool result's content so the model sees it on its next
# iteration. Message-role alternation is preserved (we modify an
# existing tool message rather than inserting a new user turn).
self._pending_steer: Optional[str] = None
self._pending_steer_lock = threading.Lock()
# Concurrent-tool worker thread tracking. `_execute_tool_calls_concurrent`
# runs each tool on its own ThreadPoolExecutor worker — those worker
# threads have tids distinct from `_execution_thread_id`, so
# `_set_interrupt(True, _execution_thread_id)` alone does NOT cause
# `is_interrupted()` inside the worker to return True. Track the
# workers here so `interrupt()` / `clear_interrupt()` can fan out to
# their tids explicitly.
self._tool_worker_threads: set[int] = set()
self._tool_worker_threads_lock = threading.Lock()
# Subagent delegation state
self._delegate_depth = 0 # 0 = top-level agent, incremented for children
self._active_children = [] # Running child AIAgents (for interrupt propagation)
self._active_children_lock = threading.Lock()
# Store OpenRouter provider preferences
self.providers_allowed = providers_allowed
self.providers_ignored = providers_ignored
self.providers_order = providers_order
self.provider_sort = provider_sort
self.provider_require_parameters = provider_require_parameters
self.provider_data_collection = provider_data_collection
self.openrouter_min_coding_score = openrouter_min_coding_score
# Store toolset filtering options
self.enabled_toolsets = enabled_toolsets
self.disabled_toolsets = disabled_toolsets
# Model response configuration
self.max_tokens = max_tokens # None = use model default
self.reasoning_config = reasoning_config # None = use default (medium for OpenRouter)
self.service_tier = service_tier
self.request_overrides = dict(request_overrides or {})
self.prefill_messages = prefill_messages or [] # Prefilled conversation turns
self._force_ascii_payload = False
# Anthropic prompt caching: auto-enabled for Claude models on native
# Anthropic, OpenRouter, and third-party gateways that speak the
# Anthropic protocol (``api_mode == 'anthropic_messages'``). Reduces
# input costs by ~75% on multi-turn conversations. Uses system_and_3
# strategy (4 breakpoints). See ``_anthropic_prompt_cache_policy``
# for the layout-vs-transport decision.
self._use_prompt_caching, self._use_native_cache_layout = (
self._anthropic_prompt_cache_policy()
)
# Anthropic supports "5m" (default) and "1h" cache TTL tiers. Read from
# config.yaml under prompt_caching.cache_ttl; unknown values keep "5m".
# 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long
# sessions with >5-minute pauses between turns (#14971).
self._cache_ttl = "5m"
try:
from hermes_cli.config import load_config as _load_pc_cfg
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
_ttl = _pc_cfg.get("cache_ttl", "5m")
if _ttl in {"5m", "1h"}:
self._cache_ttl = _ttl
except Exception:
pass
# Iteration budget: the LLM is only notified when it actually exhausts
# the iteration budget (api_call_count >= max_iterations). At that
# point we inject ONE message, allow one final API call, and if the
# model doesn't produce a text response, force a user-message asking
# it to summarise. No intermediate pressure warnings — they caused
# models to "give up" prematurely on complex tasks (#7915).
self._budget_exhausted_injected = False
self._budget_grace_call = False
# Activity tracking — updated on each API call, tool execution, and
# stream chunk. Used by the gateway timeout handler to report what the
# agent was doing when it was killed, and by the "still working"
# notifications to show progress.
self._last_activity_ts: float = time.time()
self._last_activity_desc: str = "initializing"
self._current_tool: str | None = None
self._api_call_count: int = 0
# Rate limit tracking — updated from x-ratelimit-* response headers
# after each API call. Accessed by /usage slash command.
self._rate_limit_state: Optional["RateLimitState"] = None
# OpenRouter response cache hit counter — incremented when
# X-OpenRouter-Cache-Status: HIT is seen in streaming response headers.
self._or_cache_hits: int = 0
# Centralized logging — agent.log (INFO+) and errors.log (WARNING+)
# both live under ~/.hermes/logs/. Idempotent, so gateway mode
# (which creates a new AIAgent per message) won't duplicate handlers.
from hermes_logging import setup_logging, setup_verbose_logging
setup_logging(hermes_home=_hermes_home)
if self.verbose_logging:
setup_verbose_logging()
logger.info("Verbose logging enabled (third-party library logs suppressed)")
elif self.quiet_mode:
# In quiet mode (CLI default), keep console output clean —
# but DO NOT raise per-logger levels. Doing so prevents the
# root logger's file handlers (agent.log, errors.log) from
# ever seeing the records, because Python checks
# logger.isEnabledFor() before handler propagation. We rely
# on the fact that hermes_logging.setup_logging() does not
# install a console StreamHandler in quiet mode — so INFO
# records flow to the file handlers but never reach a
# console. Any future noise reduction belongs at the
# handler level inside hermes_logging.py, not here.
pass
# Internal stream callback (set during streaming TTS).
# Initialized here so _vprint can reference it before run_conversation.
self._stream_callback = None
# Deferred paragraph break flag — set after tool iterations so a
# single "\n\n" is prepended to the next real text delta.
self._stream_needs_break = False
# Stateful scrubber for <memory-context> spans split across stream
# deltas (#5719). sanitize_context() alone can't survive chunk
# boundaries because the block regex needs both tags in one string.
self._stream_context_scrubber = StreamingContextScrubber()
# Stateful scrubber for reasoning/thinking tags in streamed deltas
# (#17924). Replaces the per-delta _strip_think_blocks regex that
# destroyed downstream state (e.g. MiniMax-M2.7 streaming
# '<think>' as delta1 and 'Let me check' as delta2 — the regex
# erased delta1, so downstream state machines never learned a
# block was open and leaked delta2 as content).
self._stream_think_scrubber = StreamingThinkScrubber()
# Visible assistant text already delivered through live token callbacks
# during the current model response. Used to avoid re-sending the same
# commentary when the provider later returns it as a completed interim
# assistant message.
self._current_streamed_assistant_text = ""
# Optional current-turn user-message override used when the API-facing
# user message intentionally differs from the persisted transcript
# (e.g. CLI voice mode adds a temporary prefix for the live call only).
self._persist_user_message_idx = None
self._persist_user_message_override = None
# Cache anthropic image-to-text fallbacks per image payload/URL so a
# single tool loop does not repeatedly re-run auxiliary vision on the
# same image history.
self._anthropic_image_fallback_cache: Dict[str, str] = {}
# Initialize LLM client via centralized provider router.
# The router handles auth resolution, base URL, headers, and
# Codex/Anthropic wrapping for all known providers.
# raw_codex=True because the main agent needs direct responses.stream()
# access for Codex Responses API streaming.
self._anthropic_client = None
self._is_anthropic_oauth = False
# Resolve per-provider / per-model request timeout once up front so
# every client construction path below (Anthropic native, OpenAI-wire,
# router-based implicit auth) can apply it consistently. Bedrock
# Claude uses its own timeout path and is not covered here.
_provider_timeout = get_provider_request_timeout(self.provider, self.model)
if self.api_mode == "anthropic_messages":
from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
# Bedrock + Claude → use AnthropicBedrock SDK for full feature parity
# (prompt caching, thinking budgets, adaptive thinking).
_is_bedrock_anthropic = self.provider == "bedrock"
if _is_bedrock_anthropic:
from agent.anthropic_adapter import build_anthropic_bedrock_client
_region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "")
_br_region = _region_match.group(1) if _region_match else "us-east-1"
self._bedrock_region = _br_region
self._anthropic_client = build_anthropic_bedrock_client(_br_region)
self._anthropic_api_key = "aws-sdk"
self._anthropic_base_url = base_url
self._is_anthropic_oauth = False
self.api_key = "aws-sdk"
self.client = None
self._client_kwargs = {}
if not self.quiet_mode:
print(f"🤖 AI Agent initialized with model: {self.model} (AWS Bedrock + AnthropicBedrock SDK, {_br_region})")
else:
# Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic.
# Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own API key.
# Falling back would send Anthropic credentials to third-party endpoints (Fixes #1739, #minimax-401).
_is_native_anthropic = self.provider == "anthropic"
effective_key = (api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or "")
self.api_key = effective_key
self._anthropic_api_key = effective_key
self._anthropic_base_url = base_url
# Only mark the session as OAuth-authenticated when the token
# genuinely belongs to native Anthropic. Third-party providers
# (MiniMax, Kimi, GLM, LiteLLM proxies) that accept the
# Anthropic protocol must never trip OAuth code paths — doing
# so injects Claude-Code identity headers and system prompts
# that cause 401/403 on their endpoints. Guards #1739 and
# the third-party identity-injection bug.
from agent.anthropic_adapter import _is_oauth_token as _is_oat
self._is_anthropic_oauth = _is_oat(effective_key) if _is_native_anthropic else False
self._anthropic_client = build_anthropic_client(effective_key, base_url, timeout=_provider_timeout)
# No OpenAI client needed for Anthropic mode
self.client = None
self._client_kwargs = {}
if not self.quiet_mode:
print(f"🤖 AI Agent initialized with model: {self.model} (Anthropic native)")
if effective_key and len(effective_key) > 12:
print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}")
elif self.api_mode == "bedrock_converse":
# AWS Bedrock — uses boto3 directly, no OpenAI client needed.
# Region is extracted from the base_url or defaults to us-east-1.
_region_match = re.search(r"bedrock-runtime\.([a-z0-9-]+)\.", base_url or "")
self._bedrock_region = _region_match.group(1) if _region_match else "us-east-1"
# Guardrail config — read from config.yaml at init time.
self._bedrock_guardrail_config = None
try:
from hermes_cli.config import load_config as _load_br_cfg
_gr = _load_br_cfg().get("bedrock", {}).get("guardrail", {})
if _gr.get("guardrail_identifier") and _gr.get("guardrail_version"):
self._bedrock_guardrail_config = {
"guardrailIdentifier": _gr["guardrail_identifier"],
"guardrailVersion": _gr["guardrail_version"],
}
if _gr.get("stream_processing_mode"):
self._bedrock_guardrail_config["streamProcessingMode"] = _gr["stream_processing_mode"]
if _gr.get("trace"):
self._bedrock_guardrail_config["trace"] = _gr["trace"]
except Exception:
pass
self.client = None
self._client_kwargs = {}
if not self.quiet_mode:
_gr_label = " + Guardrails" if self._bedrock_guardrail_config else ""
print(f"🤖 AI Agent initialized with model: {self.model} (AWS Bedrock, {self._bedrock_region}{_gr_label})")
else:
if api_key and base_url:
# Explicit credentials from CLI/gateway — construct directly.
# The runtime provider resolver already handled auth for us.
# Extract query params (e.g. Azure api-version) from base_url
# and pass via default_query to prevent loss during SDK URL
# joining (httpx drops query string when joining paths).
_parsed_url = urlparse(base_url)
if _parsed_url.query:
_clean_url = urlunparse(_parsed_url._replace(query=""))
_query_params = {
k: v[0] for k, v in parse_qs(_parsed_url.query).items()
}
client_kwargs = {
"api_key": api_key,
"base_url": _clean_url,
"default_query": _query_params,
}
else:
client_kwargs = {"api_key": api_key, "base_url": base_url}
if _provider_timeout is not None:
client_kwargs["timeout"] = _provider_timeout
if self.provider == "copilot-acp":
client_kwargs["command"] = self.acp_command
client_kwargs["args"] = self.acp_args
effective_base = base_url
if base_url_host_matches(effective_base, "openrouter.ai"):
from agent.auxiliary_client import build_or_headers
client_kwargs["default_headers"] = build_or_headers()
elif base_url_host_matches(effective_base, "api.routermint.com"):
client_kwargs["default_headers"] = _routermint_headers()
elif base_url_host_matches(effective_base, "api.githubcopilot.com"):
from hermes_cli.models import copilot_default_headers
client_kwargs["default_headers"] = copilot_default_headers()
elif base_url_host_matches(effective_base, "api.kimi.com"):
client_kwargs["default_headers"] = {
"User-Agent": "claude-code/0.1.0",
}
elif base_url_host_matches(effective_base, "portal.qwen.ai"):
client_kwargs["default_headers"] = _qwen_portal_headers()
elif base_url_host_matches(effective_base, "chatgpt.com"):
from agent.auxiliary_client import _codex_cloudflare_headers
client_kwargs["default_headers"] = _codex_cloudflare_headers(api_key)
elif "default_headers" not in client_kwargs:
# Fall back to profile.default_headers for providers that
# declare custom headers (e.g. Vercel AI Gateway attribution,
# Kimi User-Agent on non-kimi.com endpoints).
try:
from providers import get_provider_profile as _gpf
_ph = _gpf(self.provider)
if _ph and _ph.default_headers:
client_kwargs["default_headers"] = dict(_ph.default_headers)
except Exception:
pass
else:
# No explicit creds — use the centralized provider router
from agent.auxiliary_client import resolve_provider_client
_routed_client, _ = resolve_provider_client(
self.provider or "auto", model=self.model, raw_codex=True)
if _routed_client is not None:
client_kwargs = {
"api_key": _routed_client.api_key,
"base_url": str(_routed_client.base_url),
}
if _provider_timeout is not None:
client_kwargs["timeout"] = _provider_timeout
# Preserve any default_headers the router set
if hasattr(_routed_client, '_default_headers') and _routed_client._default_headers:
client_kwargs["default_headers"] = dict(_routed_client._default_headers)
else:
# When the user explicitly chose a non-OpenRouter provider
# but no credentials were found, fail fast with a clear
# message instead of silently routing through OpenRouter.
_explicit = (self.provider or "").strip().lower()
if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
# Look up the actual env var name from the provider
# config — some providers use non-standard names
# (e.g. alibaba → DASHSCOPE_API_KEY, not ALIBABA_API_KEY).
_env_hint = f"{_explicit.upper()}_API_KEY"
try:
from hermes_cli.auth import PROVIDER_REGISTRY
_pcfg = PROVIDER_REGISTRY.get(_explicit)
if _pcfg and _pcfg.api_key_env_vars:
_env_hint = _pcfg.api_key_env_vars[0]
except Exception:
pass
# --- Init-time fallback (#17929) ---
_fb_entries = []
if isinstance(fallback_model, list):
_fb_entries = [
f for f in fallback_model
if isinstance(f, dict) and f.get("provider") and f.get("model")
]
elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"):
_fb_entries = [fallback_model]
_fb_resolved = False
for _fb in _fb_entries:
_fb_explicit_key = (_fb.get("api_key") or "").strip() or None
if not _fb_explicit_key:
_fb_key_env = (_fb.get("key_env") or _fb.get("api_key_env") or "").strip()
if _fb_key_env:
_fb_explicit_key = os.getenv(_fb_key_env, "").strip() or None
_fb_client, _fb_model = resolve_provider_client(
_fb["provider"], model=_fb["model"], raw_codex=True,
explicit_base_url=_fb.get("base_url"),
explicit_api_key=_fb_explicit_key,
)
if _fb_client is not None:
self.provider = _fb["provider"]
self.model = _fb_model or _fb["model"]
self._fallback_activated = True
client_kwargs = {
"api_key": _fb_client.api_key,
"base_url": str(_fb_client.base_url),
}
if _provider_timeout is not None:
client_kwargs["timeout"] = _provider_timeout
if hasattr(_fb_client, "_default_headers") and _fb_client._default_headers:
client_kwargs["default_headers"] = dict(_fb_client._default_headers)
_fb_resolved = True
break
if not _fb_resolved:
raise RuntimeError(
f"Provider '{_explicit}' is set in config.yaml but no API key "
f"was found. Set the {_env_hint} environment "
f"variable, or switch to a different provider with `hermes model`."
)
if not getattr(self, "_fallback_activated", False):
# No provider configured — reject with a clear message.
raise RuntimeError(
"No LLM provider configured. Run `hermes model` to "
"select a provider, or run `hermes setup` for first-time "
"configuration."
)
self._client_kwargs = client_kwargs # stored for rebuilding after interrupt
# Enable fine-grained tool streaming for Claude on OpenRouter.
# Without this, Anthropic buffers the entire tool call and goes
# silent for minutes while thinking — OpenRouter's upstream proxy
# times out during the silence. The beta header makes Anthropic
# stream tool call arguments token-by-token, keeping the
# connection alive.
_effective_base = str(client_kwargs.get("base_url", "")).lower()
if base_url_host_matches(_effective_base, "openrouter.ai") and "claude" in (self.model or "").lower():
headers = client_kwargs.get("default_headers") or {}
existing_beta = headers.get("x-anthropic-beta", "")
_FINE_GRAINED = "fine-grained-tool-streaming-2025-05-14"
if _FINE_GRAINED not in existing_beta:
if existing_beta:
headers["x-anthropic-beta"] = f"{existing_beta},{_FINE_GRAINED}"
else:
headers["x-anthropic-beta"] = _FINE_GRAINED
client_kwargs["default_headers"] = headers
self.api_key = client_kwargs.get("api_key", "")
self.base_url = client_kwargs.get("base_url", self.base_url)
try:
self.client = self._create_openai_client(client_kwargs, reason="agent_init", shared=True)
if not self.quiet_mode:
print(f"🤖 AI Agent initialized with model: {self.model}")
if base_url:
print(f"🔗 Using custom base URL: {base_url}")
# Always show API key info (masked) for debugging auth issues
key_used = client_kwargs.get("api_key", "none")
if key_used and key_used != "dummy-key" and len(key_used) > 12:
print(f"🔑 Using API key: {key_used[:8]}...{key_used[-4:]}")
else:
print(f"⚠️ Warning: API key appears invalid or missing (got: '{key_used[:20] if key_used else 'none'}...')")
except Exception as e:
raise RuntimeError(f"Failed to initialize OpenAI client: {e}")
# Provider fallback chain — ordered list of backup providers tried
# when the primary is exhausted (rate-limit, overload, connection
# failure). Supports both legacy single-dict ``fallback_model`` and
# new list ``fallback_providers`` format.
if isinstance(fallback_model, list):
self._fallback_chain = [
f for f in fallback_model
if isinstance(f, dict) and f.get("provider") and f.get("model")
]
elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"):
self._fallback_chain = [fallback_model]
else:
self._fallback_chain = []
self._fallback_index = 0
self._fallback_activated = getattr(self, "_fallback_activated", False)
# Legacy attribute kept for backward compat (tests, external callers)
self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None
if self._fallback_chain and not self.quiet_mode:
if len(self._fallback_chain) == 1:
fb = self._fallback_chain[0]
print(f"🔄 Fallback model: {fb['model']} ({fb['provider']})")
else:
print(f"🔄 Fallback chain ({len(self._fallback_chain)} providers): " +
"".join(f"{f['model']} ({f['provider']})" for f in self._fallback_chain))
# Get available tools with filtering
self.tools = get_tool_definitions(
enabled_toolsets=enabled_toolsets,
disabled_toolsets=disabled_toolsets,
quiet_mode=self.quiet_mode,
)
# Show tool configuration and store valid tool names for validation
self.valid_tool_names = set()
if self.tools:
self.valid_tool_names = {tool["function"]["name"] for tool in self.tools}
tool_names = sorted(self.valid_tool_names)
if not self.quiet_mode:
print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}")
# Show filtering info if applied
if enabled_toolsets:
print(f" ✅ Enabled toolsets: {', '.join(enabled_toolsets)}")
if disabled_toolsets:
print(f" ❌ Disabled toolsets: {', '.join(disabled_toolsets)}")
elif not self.quiet_mode:
print("🛠️ No tools loaded (all tools filtered out or unavailable)")
# Check tool requirements
if self.tools and not self.quiet_mode:
requirements = check_toolset_requirements()
missing_reqs = [name for name, available in requirements.items() if not available]
if missing_reqs:
print(f"⚠️ Some tools may not work due to missing requirements: {missing_reqs}")
# Show trajectory saving status
if self.save_trajectories and not self.quiet_mode:
print("📝 Trajectory saving enabled")
# Show ephemeral system prompt status
if self.ephemeral_system_prompt and not self.quiet_mode:
prompt_preview = self.ephemeral_system_prompt[:60] + "..." if len(self.ephemeral_system_prompt) > 60 else self.ephemeral_system_prompt
print(f"🔒 Ephemeral system prompt: '{prompt_preview}' (not saved to trajectories)")
# Show prompt caching status
if self._use_prompt_caching and not self.quiet_mode:
if self._use_native_cache_layout and self.provider == "anthropic":
source = "native Anthropic"
elif self._use_native_cache_layout:
source = "Anthropic-compatible endpoint"
else:
source = "Claude via OpenRouter"
print(f"💾 Prompt caching: ENABLED ({source}, {self._cache_ttl} TTL)")
# Session logging setup - auto-save conversation trajectories for debugging
self.session_start = datetime.now()
if session_id:
# Use provided session ID (e.g., from CLI)
self.session_id = session_id
else:
# Generate a new session ID
timestamp_str = self.session_start.strftime("%Y%m%d_%H%M%S")
short_uuid = uuid.uuid4().hex[:6]
self.session_id = f"{timestamp_str}_{short_uuid}"
# Expose session ID to tools (terminal, execute_code) so agents can
# reference their own session for --resume commands, cross-session
# coordination, and logging. Uses the ContextVar system from
# session_context.py for concurrency safety (gateway runs multiple
# sessions in one process). Also writes os.environ as fallback for
# CLI mode where ContextVars aren't used.
os.environ["HERMES_SESSION_ID"] = self.session_id
try:
from gateway.session_context import _SESSION_ID
_SESSION_ID.set(self.session_id)
except Exception:
pass # CLI/test mode — ContextVar not needed
# Session logs go into ~/.hermes/sessions/ alongside gateway sessions
hermes_home = get_hermes_home()
self.logs_dir = hermes_home / "sessions"
self.logs_dir.mkdir(parents=True, exist_ok=True)
self.session_log_file = self.logs_dir / f"session_{self.session_id}.json"
# Track conversation messages for session logging
self._session_messages: List[Dict[str, Any]] = []
self._memory_write_origin = "assistant_tool"
self._memory_write_context = "foreground"
# Cached system prompt -- built once per session, only rebuilt on compression
self._cached_system_prompt: Optional[str] = None
# Filesystem checkpoint manager (transparent — not a tool)
from tools.checkpoint_manager import CheckpointManager
self._checkpoint_mgr = CheckpointManager(
enabled=checkpoints_enabled,
max_snapshots=checkpoint_max_snapshots,
max_total_size_mb=checkpoint_max_total_size_mb,
max_file_size_mb=checkpoint_max_file_size_mb,
)
# SQLite session store (optional -- provided by CLI or gateway)
self._session_db = session_db
self._parent_session_id = parent_session_id
self._last_flushed_db_idx = 0 # tracks DB-write cursor to prevent duplicate writes
self._session_db_created = False # DB row deferred to run_conversation()
self._session_init_model_config = {
"max_iterations": self.max_iterations,
"reasoning_config": reasoning_config,
"max_tokens": max_tokens,
}
# In-memory todo list for task planning (one per agent/session)
from tools.todo_tool import TodoStore
self._todo_store = TodoStore()
# Load config once for memory, skills, and compression sections
try:
from hermes_cli.config import load_config as _load_agent_config
_agent_cfg = _load_agent_config()
except Exception:
_agent_cfg = {}
try:
self._tool_guardrails = ToolCallGuardrailController(
ToolCallGuardrailConfig.from_mapping(
_agent_cfg.get("tool_loop_guardrails", {})
)
)
except Exception as _tlg_err:
logger.warning("Tool loop guardrail config ignored: %s", _tlg_err)
# Cache only the derived auxiliary compression context override that is
# needed later by the startup feasibility check. Avoid exposing a
# broad pseudo-public config object on the agent instance.
self._aux_compression_context_length_config = None
# Persistent memory (MEMORY.md + USER.md) -- loaded from disk
self._memory_store = None
self._memory_enabled = False
self._user_profile_enabled = False
self._memory_nudge_interval = 10
self._turns_since_memory = 0
self._iters_since_skill = 0
if not skip_memory:
try:
mem_config = _agent_cfg.get("memory", {})
self._memory_enabled = mem_config.get("memory_enabled", False)
self._user_profile_enabled = mem_config.get("user_profile_enabled", False)
self._memory_nudge_interval = int(mem_config.get("nudge_interval", 10))
if self._memory_enabled or self._user_profile_enabled:
from tools.memory_tool import MemoryStore
self._memory_store = MemoryStore(
memory_char_limit=mem_config.get("memory_char_limit", 2200),
user_char_limit=mem_config.get("user_char_limit", 1375),
)
self._memory_store.load_from_disk()
except Exception:
pass # Memory is optional -- don't break agent init
# Memory provider plugin (external — one at a time, alongside built-in)
# Reads memory.provider from config to select which plugin to activate.
self._memory_manager = None
if not skip_memory:
try:
_mem_provider_name = mem_config.get("provider", "") if mem_config else ""
if _mem_provider_name:
from agent.memory_manager import MemoryManager as _MemoryManager
from plugins.memory import load_memory_provider as _load_mem
self._memory_manager = _MemoryManager()
_mp = _load_mem(_mem_provider_name)
if _mp and _mp.is_available():
self._memory_manager.add_provider(_mp)
if self._memory_manager.providers:
_init_kwargs = {
"session_id": self.session_id,
"platform": platform or "cli",
"hermes_home": str(get_hermes_home()),
"agent_context": "primary",
}
# Thread session title for memory provider scoping
# (e.g. honcho uses this to derive chat-scoped session keys)
if self._session_db:
try:
_st = self._session_db.get_session_title(self.session_id)
if _st:
_init_kwargs["session_title"] = _st
except Exception:
pass
# Thread gateway user identity for per-user memory scoping
if self._user_id:
_init_kwargs["user_id"] = self._user_id
if self._user_name:
_init_kwargs["user_name"] = self._user_name
if self._chat_id:
_init_kwargs["chat_id"] = self._chat_id
if self._chat_name:
_init_kwargs["chat_name"] = self._chat_name
if self._chat_type:
_init_kwargs["chat_type"] = self._chat_type
if self._thread_id:
_init_kwargs["thread_id"] = self._thread_id
# Thread gateway session key for stable per-chat Honcho session isolation
if self._gateway_session_key:
_init_kwargs["gateway_session_key"] = self._gateway_session_key
# Profile identity for per-profile provider scoping
try:
from hermes_cli.profiles import get_active_profile_name
_profile = get_active_profile_name()
_init_kwargs["agent_identity"] = _profile
_init_kwargs["agent_workspace"] = "hermes"
except Exception:
pass
self._memory_manager.initialize_all(**_init_kwargs)
logger.info("Memory provider '%s' activated", _mem_provider_name)
else:
logger.debug("Memory provider '%s' not found or not available", _mem_provider_name)
self._memory_manager = None
except Exception as _mpe:
logger.warning("Memory provider plugin init failed: %s", _mpe)
self._memory_manager = None
# Inject memory provider tool schemas into the tool surface.
# Skip tools whose names already exist (plugins may register the
# same tools via ctx.register_tool(), which lands in self.tools
# through get_tool_definitions()). Duplicate function names cause
# 400 errors on providers that enforce unique names (e.g. Xiaomi
# MiMo via Nous Portal).
if self._memory_manager and self.tools is not None:
_existing_tool_names = {
t.get("function", {}).get("name")
for t in self.tools
if isinstance(t, dict)
}
for _schema in self._memory_manager.get_all_tool_schemas():
_tname = _schema.get("name", "")
if _tname and _tname in _existing_tool_names:
continue # already registered via plugin path
_wrapped = {"type": "function", "function": _schema}
self.tools.append(_wrapped)
if _tname:
self.valid_tool_names.add(_tname)
_existing_tool_names.add(_tname)
# Skills config: nudge interval for skill creation reminders
self._skill_nudge_interval = 10
try:
skills_config = _agent_cfg.get("skills", {})
self._skill_nudge_interval = int(skills_config.get("creation_nudge_interval", 10))
except Exception:
pass
# Tool-use enforcement config: "auto" (default — matches hardcoded
# model list), true (always), false (never), or list of substrings.
_agent_section = _agent_cfg.get("agent", {})
if not isinstance(_agent_section, dict):
_agent_section = {}
self._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")
# App-level API retry count (wraps each model API call). Default 3,
# overridable via agent.api_max_retries in config.yaml. See #11616.
try:
_raw_api_retries = _agent_section.get("api_max_retries", 3)
_api_retries = int(_raw_api_retries)
_api_retries = max(_api_retries, 1) # 1 = no retry (single attempt)
except (TypeError, ValueError):
_api_retries = 3
self._api_max_retries = _api_retries
# Initialize context compressor for automatic context management
# Compresses conversation when approaching model's context limit
# Configuration via config.yaml (compression section)
_compression_cfg = _agent_cfg.get("compression", {})
if not isinstance(_compression_cfg, dict):
_compression_cfg = {}
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
try:
from agent.auxiliary_client import _compression_threshold_for_model as _cthresh_fn
_model_cthresh = _cthresh_fn(self.model)
if _model_cthresh is not None:
compression_threshold = _model_cthresh
except Exception:
pass
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"}
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
# protect_first_n is the number of non-system messages to protect at
# the head, in addition to the system prompt (which is always
# implicitly protected by the compressor). Floor at 0 — a value of
# 0 means "preserve only the system prompt + summary + tail", which
# is a legitimate (and common) configuration for long-running
# rolling-compaction sessions.
compression_protect_first = max(
0, int(_compression_cfg.get("protect_first_n", 3))
)
# Read optional explicit context_length override for the auxiliary
# compression model. Custom endpoints often cannot report this via
# /models, so the startup feasibility check needs the config hint.
try:
_aux_cfg = cfg_get(_agent_cfg, "auxiliary", "compression", default={})
except Exception:
_aux_cfg = {}
if isinstance(_aux_cfg, dict):
_aux_context_config = _aux_cfg.get("context_length")
else:
_aux_context_config = None
if _aux_context_config is not None:
try:
_aux_context_config = int(_aux_context_config)
except (TypeError, ValueError):
_aux_context_config = None
self._aux_compression_context_length_config = _aux_context_config
# Read explicit model output-token override from config when the
# caller did not pass one directly.
_model_cfg = _agent_cfg.get("model", {})
if self.max_tokens is None and isinstance(_model_cfg, dict):
_config_max_tokens = _model_cfg.get("max_tokens")
if _config_max_tokens is not None:
try:
if isinstance(_config_max_tokens, bool):
raise ValueError
_parsed_max_tokens = int(_config_max_tokens)
if _parsed_max_tokens <= 0:
raise ValueError
self.max_tokens = _parsed_max_tokens
except (TypeError, ValueError):
logger.warning(
"Invalid model.max_tokens in config.yaml: %r"
"must be a positive integer (e.g. 4096). "
"Falling back to provider default.",
_config_max_tokens,
)
print(
f"\n⚠ Invalid model.max_tokens in config.yaml: {_config_max_tokens!r}\n"
f" Must be a positive integer (e.g. 4096).\n"
f" Falling back to provider default.\n",
file=sys.stderr,
)
self._session_init_model_config["max_tokens"] = self.max_tokens
# Read explicit context_length override from model config
if isinstance(_model_cfg, dict):
_config_context_length = _model_cfg.get("context_length")
else:
_config_context_length = None
if _config_context_length is not None:
try:
_config_context_length = int(_config_context_length)
except (TypeError, ValueError):
logger.warning(
"Invalid model.context_length in config.yaml: %r"
"must be a plain integer (e.g. 256000, not '256K'). "
"Falling back to auto-detection.",
_config_context_length,
)
print(
f"\n⚠ Invalid model.context_length in config.yaml: {_config_context_length!r}\n"
f" Must be a plain integer (e.g. 256000, not '256K').\n"
f" Falling back to auto-detected context window.\n",
file=sys.stderr,
)
_config_context_length = None
# Resolve custom_providers list once for reuse below (startup
# context-length override and plugin context-engine init).
try:
from hermes_cli.config import get_compatible_custom_providers
_custom_providers = get_compatible_custom_providers(_agent_cfg)
except Exception:
_custom_providers = _agent_cfg.get("custom_providers")
if not isinstance(_custom_providers, list):
_custom_providers = []
# Store for reuse by _check_compression_model_feasibility (auxiliary
# compression model context-length detection needs the same list).
self._custom_providers = _custom_providers
# Check custom_providers per-model context_length
if _config_context_length is None and _custom_providers:
try:
from hermes_cli.config import get_custom_provider_context_length
_cp_ctx_resolved = get_custom_provider_context_length(
model=self.model,
base_url=self.base_url,
custom_providers=_custom_providers,
)
if _cp_ctx_resolved:
_config_context_length = int(_cp_ctx_resolved)
except Exception:
_cp_ctx_resolved = None
# Surface a clear warning if the user set a context_length but it
# wasn't a valid positive int — the helper silently skips those.
if _config_context_length is None:
_target = self.base_url.rstrip("/") if self.base_url else ""
for _cp_entry in _custom_providers:
if not isinstance(_cp_entry, dict):
continue
_cp_url = (_cp_entry.get("base_url") or "").rstrip("/")
if _target and _cp_url == _target:
_cp_models = _cp_entry.get("models", {})
if isinstance(_cp_models, dict):
_cp_model_cfg = _cp_models.get(self.model, {})
if isinstance(_cp_model_cfg, dict):
_cp_ctx = _cp_model_cfg.get("context_length")
if _cp_ctx is not None:
try:
_parsed = int(_cp_ctx)
if _parsed <= 0:
raise ValueError
except (TypeError, ValueError):
logger.warning(
"Invalid context_length for model %r in "
"custom_providers: %r — must be a positive "
"integer (e.g. 256000, not '256K'). "
"Falling back to auto-detection.",
self.model, _cp_ctx,
)
print(
f"\n⚠ Invalid context_length for model {self.model!r} in custom_providers: {_cp_ctx!r}\n"
f" Must be a positive integer (e.g. 256000, not '256K').\n"
f" Falling back to auto-detected context window.\n",
file=sys.stderr,
)
break
# Persist for reuse on switch_model / fallback activation. Must come
# AFTER the custom_providers branch so per-model overrides aren't lost.
self._config_context_length = _config_context_length
self._ensure_lmstudio_runtime_loaded(_config_context_length)
# Select context engine: config-driven (like memory providers).
# 1. Check config.yaml context.engine setting
# 2. Check plugins/context_engine/<name>/ directory (repo-shipped)
# 3. Check general plugin system (user-installed plugins)
# 4. Fall back to built-in ContextCompressor
_selected_engine = None
_engine_name = "compressor" # default
try:
_ctx_cfg = _agent_cfg.get("context", {}) if isinstance(_agent_cfg, dict) else {}
_engine_name = _ctx_cfg.get("engine", "compressor") or "compressor"
except Exception:
pass
if _engine_name != "compressor":
# Try loading from plugins/context_engine/<name>/
try:
from plugins.context_engine import load_context_engine
_selected_engine = load_context_engine(_engine_name)
except Exception as _ce_load_err:
logger.debug("Context engine load from plugins/context_engine/: %s", _ce_load_err)
# Try general plugin system as fallback
if _selected_engine is None:
try:
from hermes_cli.plugins import get_plugin_context_engine
_candidate = get_plugin_context_engine()
if _candidate and _candidate.name == _engine_name:
_selected_engine = _candidate
except Exception:
pass
if _selected_engine is None:
logger.warning(
"Context engine '%s' not found — falling back to built-in compressor",
_engine_name,
)
# else: config says "compressor" — use built-in, don't auto-activate plugins
if _selected_engine is not None:
self.context_compressor = _selected_engine
# Resolve context_length for plugin engines — mirrors switch_model() path
from agent.model_metadata import get_model_context_length
_plugin_ctx_len = get_model_context_length(
self.model,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
config_context_length=_config_context_length,
provider=self.provider,
custom_providers=_custom_providers,
)
self.context_compressor.update_model(
model=self.model,
context_length=_plugin_ctx_len,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
provider=self.provider,
)
if not self.quiet_mode:
logger.info("Using context engine: %s", _selected_engine.name)
else:
self.context_compressor = ContextCompressor(
model=self.model,
threshold_percent=compression_threshold,
protect_first_n=compression_protect_first,
protect_last_n=compression_protect_last,
summary_target_ratio=compression_target_ratio,
summary_model_override=None,
quiet_mode=self.quiet_mode,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
config_context_length=_config_context_length,
provider=self.provider,
api_mode=self.api_mode,
)
self.compression_enabled = compression_enabled
# Reject models whose context window is below the minimum required
# for reliable tool-calling workflows (64K tokens).
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
_ctx = getattr(self.context_compressor, "context_length", 0)
if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH:
raise ValueError(
f"Model {self.model} has a context window of {_ctx:,} tokens, "
f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
f"by Hermes Agent. Choose a model with at least "
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
f"model.context_length in config.yaml to override."
)
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
# Skip names that are already present — the get_tool_definitions()
# quiet_mode cache returned a shared list pre-#17335, so a stray
# mutation here would poison subsequent agent inits in the same
# Gateway process and trip provider-side 'duplicate tool name'
# errors. Even with the cache fix, dedup is the right defense
# against plugin paths that may register the same schemas via
# ctx.register_tool(). Mirrors the memory tools dedup above.
self._context_engine_tool_names: set = set()
if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None:
_existing_tool_names = {
t.get("function", {}).get("name")
for t in self.tools
if isinstance(t, dict)
}
for _schema in self.context_compressor.get_tool_schemas():
_tname = _schema.get("name", "")
if _tname and _tname in _existing_tool_names:
continue # already registered via plugin/cache path
_wrapped = {"type": "function", "function": _schema}
self.tools.append(_wrapped)
if _tname:
self.valid_tool_names.add(_tname)
self._context_engine_tool_names.add(_tname)
_existing_tool_names.add(_tname)
# Notify context engine of session start
if hasattr(self, "context_compressor") and self.context_compressor:
try:
self.context_compressor.on_session_start(
self.session_id,
hermes_home=str(get_hermes_home()),
platform=self.platform or "cli",
model=self.model,
context_length=getattr(self.context_compressor, "context_length", 0),
)
except Exception as _ce_err:
logger.debug("Context engine on_session_start: %s", _ce_err)
self._subdirectory_hints = SubdirectoryHintTracker(
working_dir=os.getenv("TERMINAL_CWD") or None,
)
self._user_turn_count = 0
# Cumulative token usage for the session
self.session_prompt_tokens = 0
self.session_completion_tokens = 0
self.session_total_tokens = 0
self.session_api_calls = 0
self.session_input_tokens = 0
self.session_output_tokens = 0
self.session_cache_read_tokens = 0
self.session_cache_write_tokens = 0
self.session_reasoning_tokens = 0
self.session_estimated_cost_usd = 0.0
self.session_cost_status = "unknown"
self.session_cost_source = "none"
# ── Ollama num_ctx injection ──
# Ollama defaults to 2048 context regardless of the model's capabilities.
# When running against an Ollama server, detect the model's max context
# and pass num_ctx on every chat request so the full window is used.
# User override: set model.ollama_num_ctx in config.yaml to cap VRAM use.
# If model.context_length is set, it caps num_ctx so the user's VRAM
# budget is respected even when GGUF metadata advertises a larger window.
self._ollama_num_ctx: int | None = None
_ollama_num_ctx_override = None
if isinstance(_model_cfg, dict):
_ollama_num_ctx_override = _model_cfg.get("ollama_num_ctx")
if _ollama_num_ctx_override is not None:
try:
self._ollama_num_ctx = int(_ollama_num_ctx_override)
except (TypeError, ValueError):
logger.debug("Invalid ollama_num_ctx config value: %r", _ollama_num_ctx_override)
if self._ollama_num_ctx is None and self.base_url and is_local_endpoint(self.base_url):
try:
_detected = query_ollama_num_ctx(self.model, self.base_url, api_key=self.api_key or "")
if _detected and _detected > 0:
self._ollama_num_ctx = _detected
except Exception as exc:
logger.debug("Ollama num_ctx detection failed: %s", exc)
# Cap auto-detected ollama_num_ctx to the user's explicit context_length.
# Without this, GGUF metadata can advertise 256K+ which Ollama honours
# by allocating that much VRAM — blowing up small GPUs even though the
# user explicitly set a smaller context_length in config.yaml.
if (
self._ollama_num_ctx
and _config_context_length
and _ollama_num_ctx_override is None # don't override explicit ollama_num_ctx
and self._ollama_num_ctx > _config_context_length
):
logger.info(
"Ollama num_ctx capped: %d -> %d (model.context_length override)",
self._ollama_num_ctx, _config_context_length,
)
self._ollama_num_ctx = _config_context_length
if self._ollama_num_ctx and not self.quiet_mode:
logger.info(
"Ollama num_ctx: will request %d tokens (model max from /api/show)",
self._ollama_num_ctx,
)
if not self.quiet_mode:
if compression_enabled:
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {self.context_compressor.threshold_tokens:,})")
else:
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
# Check immediately so CLI users see the warning at startup.
# Gateway status_callback is not yet wired, so any warning is stored
# in _compression_warning and replayed in the first run_conversation().
self._compression_warning = None
self._check_compression_model_feasibility()
# Snapshot primary runtime for per-turn restoration. When fallback
# activates during a turn, the next turn restores these values so the
# preferred model gets a fresh attempt each time. Uses a single dict
# so new state fields are easy to add without N individual attributes.
_cc = self.context_compressor
self._primary_runtime = {
"model": self.model,
"provider": self.provider,
"base_url": self.base_url,
"api_mode": self.api_mode,
"api_key": getattr(self, "api_key", ""),
"client_kwargs": dict(self._client_kwargs),
"use_prompt_caching": self._use_prompt_caching,
"use_native_cache_layout": self._use_native_cache_layout,
# Context engine state that _try_activate_fallback() overwrites.
# Use getattr for model/base_url/api_key/provider since plugin
# engines may not have these (they're ContextCompressor-specific).
"compressor_model": getattr(_cc, "model", self.model),
"compressor_base_url": getattr(_cc, "base_url", self.base_url),
"compressor_api_key": getattr(_cc, "api_key", ""),
"compressor_provider": getattr(_cc, "provider", self.provider),
"compressor_context_length": _cc.context_length,
"compressor_threshold_tokens": _cc.threshold_tokens,
}
if self.api_mode == "anthropic_messages":
self._primary_runtime.update({
"anthropic_api_key": self._anthropic_api_key,
"anthropic_base_url": self._anthropic_base_url,
"is_anthropic_oauth": self._is_anthropic_oauth,
})
def _get_session_db_for_recall(self):
"""Return a SessionDB for recall, lazily creating it if an entrypoint forgot.
Most frontends pass ``session_db`` into ``AIAgent`` explicitly, but recall
is important enough that a missing constructor argument should degrade by
opening the default state DB instead of making the advertised
``session_search`` tool unusable.
"""
if self._session_db is not None:
return self._session_db
try:
from hermes_state import SessionDB
self._session_db = SessionDB()
return self._session_db
except Exception as exc:
logger.debug("SessionDB unavailable for recall", exc_info=True)
return None
def _ensure_db_session(self) -> None:
"""Create session DB row on first use. Disables _session_db on failure."""
if self._session_db_created or not self._session_db:
return
try:
self._session_db.create_session(
session_id=self.session_id,
source=self.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
model=self.model,
model_config=self._session_init_model_config,
system_prompt=self._cached_system_prompt,
user_id=None,
parent_session_id=self._parent_session_id,
)
self._session_db_created = True
except Exception as e:
# Transient failure (e.g. SQLite lock). Keep _session_db alive —
# _session_db_created stays False so next run_conversation() retries.
logger.warning(
"Session DB creation failed (will retry next turn): %s", e
)
def reset_session_state(self):
"""Reset all session-scoped token counters to 0 for a fresh session.
This method encapsulates the reset logic for all session-level metrics
including:
- Token usage counters (input, output, total, prompt, completion)
- Cache read/write tokens
- API call count
- Reasoning tokens
- Estimated cost tracking
- Context compressor internal counters
The method safely handles optional attributes (e.g., context compressor)
using ``hasattr`` checks.
This keeps the counter reset logic DRY and maintainable in one place
rather than scattering it across multiple methods.
"""
# Token usage counters
self.session_total_tokens = 0
self.session_input_tokens = 0
self.session_output_tokens = 0
self.session_prompt_tokens = 0
self.session_completion_tokens = 0
self.session_cache_read_tokens = 0
self.session_cache_write_tokens = 0
self.session_reasoning_tokens = 0
self.session_api_calls = 0
self.session_estimated_cost_usd = 0.0
self.session_cost_status = "unknown"
self.session_cost_source = "none"
# Turn counter (added after reset_session_state was first written — #2635)
self._user_turn_count = 0
# Context engine reset (works for both built-in compressor and plugins)
if hasattr(self, "context_compressor") and self.context_compressor:
self.context_compressor.on_session_reset()
def _ensure_lmstudio_runtime_loaded(self, config_context_length: Optional[int] = None) -> None:
"""
Preload the LM Studio model with at least Hermes' minimum context.
"""
if (self.provider or "").strip().lower() != "lmstudio":
return
try:
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
from hermes_cli.models import ensure_lmstudio_model_loaded
if config_context_length is None:
config_context_length = getattr(self, "_config_context_length", None)
target_ctx = max(config_context_length or 0, MINIMUM_CONTEXT_LENGTH)
loaded_ctx = ensure_lmstudio_model_loaded(
self.model, self.base_url, getattr(self, "api_key", ""), target_ctx,
)
if loaded_ctx:
# Push into the live compressor so the status bar reflects the
# real loaded ctx the moment the load resolves, instead of
# holding the previous model's value (or "ctx --") through the
# next render tick.
cc = getattr(self, "context_compressor", None)
if cc is not None:
cc.update_model(
model=self.model,
context_length=loaded_ctx,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
provider=self.provider,
api_mode=self.api_mode,
)
except Exception as err:
logger.debug("LM Studio preload skipped: %s", err)
def switch_model(self, new_model, new_provider, api_key='', base_url='', api_mode=''):
"""Switch the model/provider in-place for a live agent.
Called by the /model command handlers (CLI and gateway) after
``model_switch.switch_model()`` has resolved credentials and
validated the model. This method performs the actual runtime
swap: rebuilding clients, updating caching flags, and refreshing
the context compressor.
The implementation mirrors ``_try_activate_fallback()`` for the
client-swap logic but also updates ``_primary_runtime`` so the
change persists across turns (unlike fallback which is
turn-scoped).
"""
from hermes_cli.providers import determine_api_mode
# ── Determine api_mode if not provided ──
if not api_mode:
api_mode = determine_api_mode(new_provider, base_url)
# Defense-in-depth: ensure OpenCode base_url doesn't carry a trailing
# /v1 into the anthropic_messages client, which would cause the SDK to
# hit /v1/v1/messages. `model_switch.switch_model()` already strips
# this, but we guard here so any direct callers (future code paths,
# tests) can't reintroduce the double-/v1 404 bug.
if (
api_mode == "anthropic_messages"
and new_provider in {"opencode-zen", "opencode-go"}
and isinstance(base_url, str)
and base_url
):
base_url = re.sub(r"/v1/?$", "", base_url)
old_model = self.model
old_provider = self.provider
# Clear the per-config context_length override so the new model's
# actual context window is resolved via get_model_context_length()
# instead of inheriting the stale value from the previous model.
self._config_context_length = None
# ── Swap core runtime fields ──
self.model = new_model
self.provider = new_provider
# Use new base_url when provided; only fall back to current when the
# new provider genuinely has no endpoint (e.g. native SDK providers).
# Without this guard the old provider's URL (e.g. Ollama's localhost
# address) would persist silently after switching to a cloud provider
# that returns an empty base_url string.
if base_url:
self.base_url = base_url
self.api_mode = api_mode
# Invalidate transport cache — new api_mode may need a different transport
if hasattr(self, "_transport_cache"):
self._transport_cache.clear()
if api_key:
self.api_key = api_key
# ── Build new client ──
if api_mode == "anthropic_messages":
from agent.anthropic_adapter import (
build_anthropic_client,
resolve_anthropic_token,
_is_oauth_token,
)
# Only fall back to ANTHROPIC_TOKEN when the provider is actually Anthropic.
# Other anthropic_messages providers (MiniMax, Alibaba, etc.) must use their own
# API key — falling back would send Anthropic credentials to third-party endpoints.
_is_native_anthropic = new_provider == "anthropic"
effective_key = (api_key or self.api_key or resolve_anthropic_token() or "") if _is_native_anthropic else (api_key or self.api_key or "")
self.api_key = effective_key
self._anthropic_api_key = effective_key
self._anthropic_base_url = base_url or getattr(self, "_anthropic_base_url", None)
self._anthropic_client = build_anthropic_client(
effective_key, self._anthropic_base_url,
timeout=get_provider_request_timeout(self.provider, self.model),
)
self._is_anthropic_oauth = _is_oauth_token(effective_key) if _is_native_anthropic else False
self.client = None
self._client_kwargs = {}
else:
effective_key = api_key or self.api_key
effective_base = base_url or self.base_url
self._client_kwargs = {
"api_key": effective_key,
"base_url": effective_base,
}
_sm_timeout = get_provider_request_timeout(self.provider, self.model)
if _sm_timeout is not None:
self._client_kwargs["timeout"] = _sm_timeout
self.client = self._create_openai_client(
dict(self._client_kwargs),
reason="switch_model",
shared=True,
)
# ── Re-evaluate prompt caching ──
self._use_prompt_caching, self._use_native_cache_layout = (
self._anthropic_prompt_cache_policy(
provider=new_provider,
base_url=self.base_url,
api_mode=api_mode,
model=new_model,
)
)
# ── LM Studio: preload before probing context length ──
self._ensure_lmstudio_runtime_loaded()
# ── Update context compressor ──
if hasattr(self, "context_compressor") and self.context_compressor:
from agent.model_metadata import get_model_context_length
# Re-read custom_providers from live config so per-model
# context_length overrides are honored when switching to a
# custom provider mid-session (closes #15779).
_sm_custom_providers = None
try:
from hermes_cli.config import load_config, get_compatible_custom_providers
_sm_cfg = load_config()
_sm_custom_providers = get_compatible_custom_providers(_sm_cfg)
except Exception:
_sm_custom_providers = None
new_context_length = get_model_context_length(
self.model,
base_url=self.base_url,
api_key=self.api_key,
provider=self.provider,
config_context_length=getattr(self, "_config_context_length", None),
custom_providers=_sm_custom_providers,
)
self.context_compressor.update_model(
model=self.model,
context_length=new_context_length,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
provider=self.provider,
api_mode=self.api_mode,
)
# ── Invalidate cached system prompt so it rebuilds next turn ──
self._cached_system_prompt = None
# ── Update _primary_runtime so the change persists across turns ──
_cc = self.context_compressor if hasattr(self, "context_compressor") and self.context_compressor else None
self._primary_runtime = {
"model": self.model,
"provider": self.provider,
"base_url": self.base_url,
"api_mode": self.api_mode,
"api_key": getattr(self, "api_key", ""),
"client_kwargs": dict(self._client_kwargs),
"use_prompt_caching": self._use_prompt_caching,
"use_native_cache_layout": self._use_native_cache_layout,
"compressor_model": getattr(_cc, "model", self.model) if _cc else self.model,
"compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url,
"compressor_api_key": getattr(_cc, "api_key", "") if _cc else "",
"compressor_provider": getattr(_cc, "provider", self.provider) if _cc else self.provider,
"compressor_context_length": _cc.context_length if _cc else 0,
"compressor_threshold_tokens": _cc.threshold_tokens if _cc else 0,
}
if api_mode == "anthropic_messages":
self._primary_runtime.update({
"anthropic_api_key": self._anthropic_api_key,
"anthropic_base_url": self._anthropic_base_url,
"is_anthropic_oauth": self._is_anthropic_oauth,
})
# ── Reset fallback state ──
self._fallback_activated = False
self._fallback_index = 0
# When the user deliberately swaps primary providers (e.g. openrouter
# → anthropic), drop any fallback entries that target the OLD primary
# or the NEW one. The chain was seeded from config at agent init for
# the original provider — without pruning, a failed turn on the new
# primary silently re-activates the provider the user just rejected,
# which is exactly what was reported during TUI v2 blitz testing
# ("switched to anthropic, tui keeps trying openrouter").
old_norm = (old_provider or "").strip().lower()
new_norm = (new_provider or "").strip().lower()
fallback_chain = list(getattr(self, "_fallback_chain", []) or [])
if old_norm and new_norm and old_norm != new_norm:
fallback_chain = [
entry for entry in fallback_chain
if (entry.get("provider") or "").strip().lower() not in {old_norm, new_norm}
]
self._fallback_chain = fallback_chain
self._fallback_model = fallback_chain[0] if fallback_chain else None
logging.info(
"Model switched in-place: %s (%s) -> %s (%s)",
old_model, old_provider, new_model, new_provider,
)
def _safe_print(self, *args, **kwargs):
"""Print that silently handles broken pipes / closed stdout.
In headless environments (systemd, Docker, nohup) stdout may become
unavailable mid-session. A raw ``print()`` raises ``OSError`` which
can crash cron jobs and lose completed work.
Internally routes through ``self._print_fn`` (default: builtin
``print``) so callers such as the CLI can inject a renderer that
handles ANSI escape sequences properly (e.g. prompt_toolkit's
``print_formatted_text(ANSI(...))``) without touching this method.
"""
try:
fn = self._print_fn or print
fn(*args, **kwargs)
except (OSError, ValueError):
pass
def _vprint(self, *args, force: bool = False, **kwargs):
"""Verbose print — suppressed when actively streaming tokens.
Pass ``force=True`` for error/warning messages that should always be
shown even during streaming playback (TTS or display).
During tool execution (``_executing_tools`` is True), printing is
allowed even with stream consumers registered because no tokens
are being streamed at that point.
After the main response has been delivered and the remaining tool
calls are post-response housekeeping (``_mute_post_response``),
all non-forced output is suppressed.
``suppress_status_output`` is a stricter CLI automation mode used by
parseable single-query flows such as ``hermes chat -q``. In that mode,
all status/diagnostic prints routed through ``_vprint`` are suppressed
so stdout stays machine-readable.
"""
if getattr(self, "suppress_status_output", False):
return
if not force and getattr(self, "_mute_post_response", False):
return
if not force and self._has_stream_consumers() and not self._executing_tools:
return
self._safe_print(*args, **kwargs)
def _should_start_quiet_spinner(self) -> bool:
"""Return True when quiet-mode spinner output has a safe sink.
In headless/stdio-protocol environments, a raw spinner with no custom
``_print_fn`` falls back to ``sys.stdout`` and can corrupt protocol
streams such as ACP JSON-RPC. Allow quiet spinners only when either:
- output is explicitly rerouted via ``_print_fn``; or
- stdout is a real TTY.
"""
if self._print_fn is not None:
return True
stream = getattr(sys, "stdout", None)
if stream is None:
return False
try:
return bool(stream.isatty())
except (AttributeError, ValueError, OSError):
return False
def _should_emit_quiet_tool_messages(self) -> bool:
"""Return True when quiet-mode tool summaries should print directly.
Quiet mode is used by both the interactive CLI and embedded/library
callers. The CLI may still want compact progress hints when no callback
owns rendering. Embedded/library callers, on the other hand, expect
quiet mode to be truly silent.
"""
return (
self.quiet_mode
and not self.tool_progress_callback
and getattr(self, "platform", "") == "cli"
)
def _emit_status(self, message: str) -> None:
"""Emit a lifecycle status message to both CLI and gateway channels.
CLI users see the message via ``_vprint(force=True)`` so it is always
visible regardless of verbose/quiet mode. Gateway consumers receive
it through ``status_callback("lifecycle", ...)``.
This helper never raises — exceptions are swallowed so it cannot
interrupt the retry/fallback logic.
"""
try:
self._vprint(f"{self.log_prefix}{message}", force=True)
except Exception:
pass
if self.status_callback:
try:
self.status_callback("lifecycle", message)
except Exception:
logger.debug("status_callback error in _emit_status", exc_info=True)
def _emit_warning(self, message: str) -> None:
"""Emit a user-visible warning through the same status plumbing.
Unlike debug logs, these warnings are meant for degraded side paths
such as auxiliary compression or memory flushes where the main turn can
continue but the user needs to know something important failed.
"""
try:
self._vprint(f"{self.log_prefix}{message}", force=True)
except Exception:
pass
if self.status_callback:
try:
self.status_callback("warn", message)
except Exception:
logger.debug("status_callback error in _emit_warning", exc_info=True)
# Stream-diagnostic class header preserved for backward compat —
# actual list lives in ``agent.stream_diag.STREAM_DIAG_HEADERS``.
from agent.stream_diag import STREAM_DIAG_HEADERS as _STREAM_DIAG_HEADERS # noqa: E402
@staticmethod
def _stream_diag_init() -> Dict[str, Any]:
"""Forwarder — see ``agent.stream_diag.stream_diag_init``."""
from agent.stream_diag import stream_diag_init
return stream_diag_init()
def _stream_diag_capture_response(
self, diag: Dict[str, Any], http_response: Any
) -> None:
"""Forwarder — see ``agent.stream_diag.stream_diag_capture_response``."""
from agent.stream_diag import stream_diag_capture_response
stream_diag_capture_response(self, diag, http_response)
@staticmethod
def _flatten_exception_chain(error: BaseException) -> str:
"""Forwarder — see ``agent.stream_diag.flatten_exception_chain``."""
from agent.stream_diag import flatten_exception_chain
return flatten_exception_chain(error)
def _log_stream_retry(
self,
*,
kind: str,
error: BaseException,
attempt: int,
max_attempts: int,
mid_tool_call: bool,
diag: Optional[Dict[str, Any]] = None,
) -> None:
"""Forwarder — see ``agent.stream_diag.log_stream_retry``."""
from agent.stream_diag import log_stream_retry
log_stream_retry(
self, kind=kind, error=error, attempt=attempt,
max_attempts=max_attempts, mid_tool_call=mid_tool_call, diag=diag,
)
def _emit_stream_drop(
self,
*,
error: BaseException,
attempt: int,
max_attempts: int,
mid_tool_call: bool,
diag: Optional[Dict[str, Any]] = None,
) -> None:
"""Forwarder — see ``agent.stream_diag.emit_stream_drop``."""
from agent.stream_diag import emit_stream_drop
emit_stream_drop(
self, error=error, attempt=attempt, max_attempts=max_attempts,
mid_tool_call=mid_tool_call, diag=diag,
)
def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None:
"""Surface a compact warning for failed auxiliary work."""
try:
detail = self._summarize_api_error(exc)
except Exception:
detail = str(exc)
detail = (detail or exc.__class__.__name__).strip()
if len(detail) > 220:
detail = detail[:217].rstrip() + "..."
self._emit_warning(f"⚠ Auxiliary {task} failed: {detail}")
def _current_main_runtime(self) -> Dict[str, str]:
"""Return the live main runtime for session-scoped auxiliary routing."""
return {
"model": getattr(self, "model", "") or "",
"provider": getattr(self, "provider", "") or "",
"base_url": getattr(self, "base_url", "") or "",
"api_key": getattr(self, "api_key", "") or "",
"api_mode": getattr(self, "api_mode", "") or "",
}
def _check_compression_model_feasibility(self) -> None:
"""Forwarder — see ``agent.conversation_compression.check_compression_model_feasibility``."""
from agent.conversation_compression import check_compression_model_feasibility
check_compression_model_feasibility(self)
def _replay_compression_warning(self) -> None:
"""Forwarder — see ``agent.conversation_compression.replay_compression_warning``."""
from agent.conversation_compression import replay_compression_warning
replay_compression_warning(self)
def _is_direct_openai_url(self, base_url: str = None) -> bool:
"""Return True when a base URL targets OpenAI's native API."""
if base_url is not None:
hostname = base_url_hostname(base_url)
else:
hostname = getattr(self, "_base_url_hostname", "") or base_url_hostname(
getattr(self, "_base_url_lower", "")
)
return hostname == "api.openai.com"
def _is_azure_openai_url(self, base_url: str = None) -> bool:
"""Return True when a base URL targets Azure OpenAI.
Azure OpenAI exposes an OpenAI-compatible endpoint at
``{resource}.openai.azure.com/openai/v1`` that accepts the
standard ``openai`` Python client. Unlike api.openai.com it
does NOT support the Responses API — gpt-5.x models are served
on the regular ``/chat/completions`` path — so routing decisions
must treat Azure separately from direct OpenAI.
"""
if base_url is not None:
url = str(base_url).lower()
else:
url = getattr(self, "_base_url_lower", "") or ""
return "openai.azure.com" in url
def _is_github_copilot_url(self, base_url: str = None) -> bool:
"""Return True when a base URL targets GitHub Copilot's OpenAI-compatible API."""
if base_url is not None:
hostname = base_url_hostname(base_url)
else:
hostname = getattr(self, "_base_url_hostname", "") or base_url_hostname(
getattr(self, "_base_url_lower", "")
)
return hostname == "api.githubcopilot.com"
def _resolved_api_call_timeout(self) -> float:
"""Resolve the effective per-call request timeout in seconds.
Priority:
1. ``providers.<id>.models.<model>.timeout_seconds`` (per-model override)
2. ``providers.<id>.request_timeout_seconds`` (provider-wide)
3. ``HERMES_API_TIMEOUT`` env var (legacy escape hatch)
4. 1800.0s default
Used by OpenAI-wire chat completions (streaming and non-streaming) so
the per-provider config knob wins over the 1800s default. Without this
helper, the hardcoded ``HERMES_API_TIMEOUT`` fallback would always be
passed as a per-call ``timeout=`` kwarg, overriding the client-level
timeout the AIAgent.__init__ path configured.
"""
cfg = get_provider_request_timeout(self.provider, self.model)
if cfg is not None:
return cfg
return float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
def _resolved_api_call_stale_timeout_base(self) -> tuple[float, bool]:
"""Resolve the base non-stream stale timeout and whether it is implicit.
Priority:
1. ``providers.<id>.models.<model>.stale_timeout_seconds``
2. ``providers.<id>.stale_timeout_seconds``
3. ``HERMES_API_CALL_STALE_TIMEOUT`` env var
4. 300.0s default
Returns ``(timeout_seconds, uses_implicit_default)`` so the caller can
preserve legacy behaviors that only apply when the user has *not*
explicitly configured a stale timeout, such as auto-disabling the
detector for local endpoints.
"""
cfg = get_provider_stale_timeout(self.provider, self.model)
if cfg is not None:
return cfg, False
env_timeout = os.getenv("HERMES_API_CALL_STALE_TIMEOUT")
if env_timeout is not None:
return float(env_timeout), False
return 300.0, True
def _compute_non_stream_stale_timeout(self, messages: list[dict[str, Any]]) -> float:
"""Compute the effective non-stream stale timeout for this request."""
stale_base, uses_implicit_default = self._resolved_api_call_stale_timeout_base()
base_url = getattr(self, "_base_url", None) or self.base_url or ""
if uses_implicit_default and base_url and is_local_endpoint(base_url):
return float("inf")
est_tokens = sum(len(str(v)) for v in messages) // 4
if est_tokens > 100_000:
return max(stale_base, 600.0)
if est_tokens > 50_000:
return max(stale_base, 450.0)
return stale_base
def _is_openrouter_url(self) -> bool:
"""Return True when the base URL targets OpenRouter."""
return base_url_host_matches(self._base_url_lower, "openrouter.ai")
def _anthropic_prompt_cache_policy(
self,
*,
provider: Optional[str] = None,
base_url: Optional[str] = None,
api_mode: Optional[str] = None,
model: Optional[str] = None,
) -> tuple[bool, bool]:
"""Forwarder — see ``agent.agent_runtime_helpers.anthropic_prompt_cache_policy``."""
from agent.agent_runtime_helpers import anthropic_prompt_cache_policy
return anthropic_prompt_cache_policy(self, provider=provider, base_url=base_url, api_mode=api_mode, model=model)
@staticmethod
def _model_requires_responses_api(model: str) -> bool:
"""Return True for models that require the Responses API path.
GPT-5.x models are rejected on /v1/chat/completions by both
OpenAI and OpenRouter (error: ``unsupported_api_for_model``).
Detect these so the correct api_mode is set regardless of
which provider is serving the model.
"""
m = model.lower()
# Strip vendor prefix (e.g. "openai/gpt-5.4" → "gpt-5.4")
if "/" in m:
m = m.rsplit("/", 1)[-1]
return m.startswith("gpt-5")
@staticmethod
def _provider_model_requires_responses_api(
model: str,
*,
provider: Optional[str] = None,
) -> bool:
"""Return True when this provider/model pair should use Responses API."""
normalized_provider = (provider or "").strip().lower()
# Nous serves GPT-5.x models via its OpenAI-compatible chat
# completions endpoint; its /v1/responses endpoint returns 404.
if normalized_provider == "nous":
return False
if normalized_provider == "copilot":
try:
from hermes_cli.models import _should_use_copilot_responses_api
return _should_use_copilot_responses_api(model)
except Exception:
# Fall back to the generic GPT-5 rule if Copilot-specific
# logic is unavailable for any reason.
pass
return AIAgent._model_requires_responses_api(model)
def _max_tokens_param(self, value: int) -> dict:
"""Return the correct max tokens kwarg for the current provider.
OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
'max_completion_tokens'. Azure OpenAI also requires
'max_completion_tokens' for gpt-5.x models served via the
OpenAI-compatible endpoint. OpenRouter, local models, and older
OpenAI models use 'max_tokens'.
"""
if self._is_direct_openai_url() or self._is_azure_openai_url() or self._is_github_copilot_url():
return {"max_completion_tokens": value}
return {"max_tokens": value}
def _has_content_after_think_block(self, content: str) -> bool:
"""
Check if content has actual text after any reasoning/thinking blocks.
This detects cases where the model only outputs reasoning but no actual
response, which indicates an incomplete generation that should be retried.
Must stay in sync with _strip_think_blocks() tag variants.
Args:
content: The assistant message content to check
Returns:
True if there's meaningful content after think blocks, False otherwise
"""
if not content:
return False
# Remove all reasoning tag variants (must match _strip_think_blocks)
cleaned = self._strip_think_blocks(content)
# Check if there's any non-whitespace content remaining
return bool(cleaned.strip())
def _strip_think_blocks(self, content: str) -> str:
"""Forwarder — see ``agent.agent_runtime_helpers.strip_think_blocks``."""
from agent.agent_runtime_helpers import strip_think_blocks
return strip_think_blocks(self, content)
@staticmethod
def _has_natural_response_ending(content: str) -> bool:
"""Heuristic: does visible assistant text look intentionally finished?"""
if not content:
return False
stripped = content.rstrip()
if not stripped:
return False
if stripped.endswith("```"):
return True
return stripped[-1] in '.!?:)"\']}。!?:)】」』》'
def _is_ollama_glm_backend(self) -> bool:
"""Detect the narrow backend family affected by Ollama/GLM stop misreports."""
model_lower = (self.model or "").lower()
provider_lower = (self.provider or "").lower()
if "glm" not in model_lower and provider_lower != "zai":
return False
if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
return True
return bool(self.base_url and is_local_endpoint(self.base_url))
def _should_treat_stop_as_truncated(
self,
finish_reason: str,
assistant_message,
messages: Optional[list] = None,
) -> bool:
"""Detect conservative stop->length misreports for Ollama-hosted GLM models."""
if finish_reason != "stop" or self.api_mode != "chat_completions":
return False
if not self._is_ollama_glm_backend():
return False
if not any(
isinstance(msg, dict) and msg.get("role") == "tool"
for msg in (messages or [])
):
return False
if assistant_message is None or getattr(assistant_message, "tool_calls", None):
return False
content = getattr(assistant_message, "content", None)
if not isinstance(content, str):
return False
visible_text = self._strip_think_blocks(content).strip()
if not visible_text:
return False
if len(visible_text) < 20 or not re.search(r"\s", visible_text):
return False
return not self._has_natural_response_ending(visible_text)
def _looks_like_codex_intermediate_ack(
self,
user_message: str,
assistant_content: str,
messages: List[Dict[str, Any]],
) -> bool:
"""Detect a planning/ack message that should continue instead of ending the turn."""
if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
return False
assistant_text = self._strip_think_blocks(assistant_content or "").strip().lower()
if not assistant_text:
return False
if len(assistant_text) > 1200:
return False
has_future_ack = bool(
re.search(r"\b(i[']ll|i will|let me|i can do that|i can help with that)\b", assistant_text)
)
if not has_future_ack:
return False
action_markers = (
"look into",
"look at",
"inspect",
"scan",
"check",
"analyz",
"review",
"explore",
"read",
"open",
"run",
"test",
"fix",
"debug",
"search",
"find",
"walkthrough",
"report back",
"summarize",
)
workspace_markers = (
"directory",
"current directory",
"current dir",
"cwd",
"repo",
"repository",
"codebase",
"project",
"folder",
"filesystem",
"file tree",
"files",
"path",
)
user_text = (user_message or "").strip().lower()
user_targets_workspace = (
any(marker in user_text for marker in workspace_markers)
or "~/" in user_text
or "/" in user_text
)
assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
assistant_targets_workspace = any(
marker in assistant_text for marker in workspace_markers
)
return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action
def _extract_reasoning(self, assistant_message) -> Optional[str]:
"""Forwarder — see ``agent.agent_runtime_helpers.extract_reasoning``."""
from agent.agent_runtime_helpers import extract_reasoning
return extract_reasoning(self, assistant_message)
def _cleanup_task_resources(self, task_id: str) -> None:
"""Forwarder — see ``agent.chat_completion_helpers.cleanup_task_resources``."""
from agent.chat_completion_helpers import cleanup_task_resources
return cleanup_task_resources(self, task_id)
# ------------------------------------------------------------------
# Background memory/skill review — prompts live in agent.background_review
# ------------------------------------------------------------------
from agent.background_review import (
_MEMORY_REVIEW_PROMPT,
_SKILL_REVIEW_PROMPT,
_COMBINED_REVIEW_PROMPT,
)
@staticmethod
def _summarize_background_review_actions(
review_messages: List[Dict],
prior_snapshot: List[Dict],
) -> List[str]:
"""Forwarder — see ``agent.background_review.summarize_background_review_actions``."""
from agent.background_review import summarize_background_review_actions
return summarize_background_review_actions(review_messages, prior_snapshot)
def _spawn_background_review(
self,
messages_snapshot: List[Dict],
review_memory: bool = False,
review_skills: bool = False,
) -> None:
"""Spawn the background memory/skill review thread.
Thin wrapper — the heavy lifting lives in
``agent.background_review.spawn_background_review_thread`` which
returns the thread target. ``threading.Thread`` is constructed
here so existing tests that patch ``run_agent.threading.Thread``
keep working.
"""
from agent.background_review import spawn_background_review_thread
target, _prompt = spawn_background_review_thread(
self,
messages_snapshot,
review_memory=review_memory,
review_skills=review_skills,
)
t = threading.Thread(target=target, daemon=True, name="bg-review")
t.start()
def _build_memory_write_metadata(
self,
*,
write_origin: Optional[str] = None,
execution_context: Optional[str] = None,
task_id: Optional[str] = None,
tool_call_id: Optional[str] = None,
) -> Dict[str, Any]:
"""Forwarder — see ``agent.background_review.build_memory_write_metadata``."""
from agent.background_review import build_memory_write_metadata
return build_memory_write_metadata(
self,
write_origin=write_origin,
execution_context=execution_context,
task_id=task_id,
tool_call_id=tool_call_id,
)
def _apply_persist_user_message_override(self, messages: List[Dict]) -> None:
"""Rewrite the current-turn user message before persistence/return.
Some call paths need an API-only user-message variant without letting
that synthetic text leak into persisted transcripts or resumed session
history. When an override is configured for the active turn, mutate the
in-memory messages list in place so both persistence and returned
history stay clean.
"""
idx = getattr(self, "_persist_user_message_idx", None)
override = getattr(self, "_persist_user_message_override", None)
if override is None or idx is None:
return
if 0 <= idx < len(messages):
msg = messages[idx]
if isinstance(msg, dict) and msg.get("role") == "user":
msg["content"] = override
def _persist_session(self, messages: List[Dict], conversation_history: List[Dict] = None):
"""Save session state to both JSON log and SQLite on any exit path.
Ensures conversations are never lost, even on errors or early returns.
"""
self._drop_trailing_empty_response_scaffolding(messages)
self._apply_persist_user_message_override(messages)
self._session_messages = messages
self._save_session_log(messages)
self._flush_messages_to_session_db(messages, conversation_history)
def _drop_trailing_empty_response_scaffolding(self, messages: List[Dict]) -> None:
"""Remove private empty-response retry/failure scaffolding from transcript tails.
Also rewinds past any trailing tool-result / assistant(tool_calls) pair
that the failed iteration left hanging. Without this, the tail ends at
a raw ``tool`` message and the next user turn lands as
``...tool, user, user`` — a protocol-invalid sequence that most
providers silently reject (returns empty content), causing the
empty-retry loop to fire forever. See #<TBD>.
"""
# Pass 1: strip the flagged scaffolding messages themselves.
dropped_scaffolding = False
while (
messages
and isinstance(messages[-1], dict)
and (
messages[-1].get("_empty_recovery_synthetic")
or messages[-1].get("_empty_terminal_sentinel")
)
):
messages.pop()
dropped_scaffolding = True
# Pass 2: if we stripped scaffolding, rewind through any trailing
# tool-result messages plus the assistant(tool_calls) message that
# produced them. This preserves role alternation so the next user
# message follows a user or assistant message, not an orphan tool
# result. Only runs when scaffolding was actually present — normal
# conversation tails (real tool loops mid-progress) are untouched.
if not dropped_scaffolding:
return
# Drop any trailing tool-result messages
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("role") == "tool"
):
messages.pop()
# Drop the assistant message that issued the tool calls, if the tail
# now ends in an assistant-with-tool_calls (the pair that owned the
# just-popped tool results). Without this, the tail is
# ``assistant(tool_calls=...)`` with no tool answers, which some
# providers also reject.
if (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("role") == "assistant"
and messages[-1].get("tool_calls")
):
messages.pop()
def _repair_message_sequence(self, messages: List[Dict]) -> int:
"""Forwarder — see ``agent.agent_runtime_helpers.repair_message_sequence``."""
from agent.agent_runtime_helpers import repair_message_sequence
return repair_message_sequence(self, messages)
def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None):
"""Persist any un-flushed messages to the SQLite session store.
Uses _last_flushed_db_idx to track which messages have already been
written, so repeated calls (from multiple exit paths) only write
truly new messages — preventing the duplicate-write bug (#860).
"""
if not self._session_db:
return
self._apply_persist_user_message_override(messages)
try:
# Retry row creation if the earlier attempt failed transiently.
if not self._session_db_created:
self._ensure_db_session()
start_idx = len(conversation_history) if conversation_history else 0
flush_from = max(start_idx, self._last_flushed_db_idx)
for msg in messages[flush_from:]:
role = msg.get("role", "unknown")
content = msg.get("content")
# Persist multimodal tool results as their text summary only —
# base64 images would bloat the session DB and aren't useful
# for cross-session replay.
if _is_multimodal_tool_result(content):
content = _multimodal_text_summary(content)
elif isinstance(content, list):
# List of OpenAI-style content parts: strip images, keep text.
_txt = []
for p in content:
if isinstance(p, dict) and p.get("type") == "text":
_txt.append(str(p.get("text", "")))
elif isinstance(p, dict) and p.get("type") in {"image", "image_url", "input_image"}:
_txt.append("[screenshot]")
content = "\n".join(_txt) if _txt else None
tool_calls_data = None
if hasattr(msg, "tool_calls") and isinstance(msg.tool_calls, list) and msg.tool_calls:
tool_calls_data = [
{"name": tc.function.name, "arguments": tc.function.arguments}
for tc in msg.tool_calls
]
elif isinstance(msg.get("tool_calls"), list):
tool_calls_data = msg["tool_calls"]
self._session_db.append_message(
session_id=self.session_id,
role=role,
content=content,
tool_name=msg.get("tool_name"),
tool_calls=tool_calls_data,
tool_call_id=msg.get("tool_call_id"),
finish_reason=msg.get("finish_reason"),
reasoning=msg.get("reasoning") if role == "assistant" else None,
reasoning_content=msg.get("reasoning_content") if role == "assistant" else None,
reasoning_details=msg.get("reasoning_details") if role == "assistant" else None,
codex_reasoning_items=msg.get("codex_reasoning_items") if role == "assistant" else None,
codex_message_items=msg.get("codex_message_items") if role == "assistant" else None,
)
self._last_flushed_db_idx = len(messages)
except Exception as e:
logger.warning("Session DB append_message failed: %s", e)
def _get_messages_up_to_last_assistant(self, messages: List[Dict]) -> List[Dict]:
"""
Get messages up to (but not including) the last assistant turn.
This is used when we need to "roll back" to the last successful point
in the conversation, typically when the final assistant message is
incomplete or malformed.
Args:
messages: Full message list
Returns:
Messages up to the last complete assistant turn (ending with user/tool message)
"""
if not messages:
return []
# Find the index of the last assistant message
last_assistant_idx = None
for i in range(len(messages) - 1, -1, -1):
if messages[i].get("role") == "assistant":
last_assistant_idx = i
break
if last_assistant_idx is None:
# No assistant message found, return all messages
return messages.copy()
# Return everything up to (not including) the last assistant message
return messages[:last_assistant_idx]
def _format_tools_for_system_message(self) -> str:
"""Forwarder — see ``agent.system_prompt.format_tools_for_system_message``."""
from agent.system_prompt import format_tools_for_system_message
return format_tools_for_system_message(self)
def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
"""Forwarder — see ``agent.agent_runtime_helpers.convert_to_trajectory_format``."""
from agent.agent_runtime_helpers import convert_to_trajectory_format
return convert_to_trajectory_format(self, messages, user_query, completed)
def _save_trajectory(self, messages: List[Dict[str, Any]], user_query: str, completed: bool):
"""
Save conversation trajectory to JSONL file.
Args:
messages (List[Dict]): Complete message history
user_query (str): Original user query
completed (bool): Whether the conversation completed successfully
"""
if not self.save_trajectories:
return
trajectory = self._convert_to_trajectory_format(messages, user_query, completed)
_save_trajectory_to_file(trajectory, self.model, completed)
@staticmethod
def _summarize_api_error(error: Exception) -> str:
"""Extract a human-readable one-liner from an API error.
Handles Cloudflare HTML error pages (502, 503, etc.) by pulling the
<title> tag instead of dumping raw HTML. Falls back to a truncated
str(error) for everything else.
"""
raw = str(error)
# Cloudflare / proxy HTML pages: grab the <title> for a clean summary
if "<!DOCTYPE" in raw or "<html" in raw:
m = re.search(r"<title[^>]*>([^<]+)</title>", raw, re.IGNORECASE)
title = m.group(1).strip() if m else "HTML error page (title not found)"
# Also grab Cloudflare Ray ID if present
ray = re.search(r"Cloudflare Ray ID:\s*<strong[^>]*>([^<]+)</strong>", raw)
ray_id = ray.group(1).strip() if ray else None
status_code = getattr(error, "status_code", None)
parts = []
if status_code:
parts.append(f"HTTP {status_code}")
parts.append(title)
if ray_id:
parts.append(f"Ray {ray_id}")
return "".join(parts)
# JSON body errors from OpenAI/Anthropic SDKs
body = getattr(error, "body", None)
if isinstance(body, dict):
msg = body.get("error", {}).get("message") if isinstance(body.get("error"), dict) else body.get("message")
if msg:
status_code = getattr(error, "status_code", None)
prefix = f"HTTP {status_code}: " if status_code else ""
return f"{prefix}{msg[:300]}"
# Fallback: truncate the raw string but give more room than 200 chars
status_code = getattr(error, "status_code", None)
prefix = f"HTTP {status_code}: " if status_code else ""
return f"{prefix}{raw[:500]}"
def _mask_api_key_for_logs(self, key: Optional[str]) -> Optional[str]:
if not key:
return None
if len(key) <= 12:
return "***"
return f"{key[:8]}...{key[-4:]}"
def _clean_error_message(self, error_msg: str) -> str:
"""
Clean up error messages for user display, removing HTML content and truncating.
Args:
error_msg: Raw error message from API or exception
Returns:
Clean, user-friendly error message
"""
if not error_msg:
return "Unknown error"
# Remove HTML content (common with CloudFlare and gateway error pages)
if error_msg.strip().startswith('<!DOCTYPE html') or '<html' in error_msg:
return "Service temporarily unavailable (HTML error page returned)"
# Remove newlines and excessive whitespace
cleaned = ' '.join(error_msg.split())
# Truncate if too long
if len(cleaned) > 150:
cleaned = cleaned[:150] + "..."
return cleaned
@staticmethod
def _extract_api_error_context(error: Exception) -> Dict[str, Any]:
"""Extract structured rate-limit details from provider errors."""
context: Dict[str, Any] = {}
body = getattr(error, "body", None)
payload = None
if isinstance(body, dict):
payload = body.get("error") if isinstance(body.get("error"), dict) else body
if isinstance(payload, dict):
reason = payload.get("code") or payload.get("error")
if isinstance(reason, str) and reason.strip():
context["reason"] = reason.strip()
message = payload.get("message") or payload.get("error_description")
if isinstance(message, str) and message.strip():
context["message"] = message.strip()
for key in ("resets_at", "reset_at"):
value = payload.get(key)
if value not in {None, ""}:
context["reset_at"] = value
break
retry_after = payload.get("retry_after")
if retry_after not in {None, ""} and "reset_at" not in context:
try:
context["reset_at"] = time.time() + float(retry_after)
except (TypeError, ValueError):
pass
response = getattr(error, "response", None)
headers = getattr(response, "headers", None)
if headers:
retry_after = headers.get("retry-after") or headers.get("Retry-After")
if retry_after and "reset_at" not in context:
try:
context["reset_at"] = time.time() + float(retry_after)
except (TypeError, ValueError):
pass
ratelimit_reset = headers.get("x-ratelimit-reset")
if ratelimit_reset and "reset_at" not in context:
context["reset_at"] = ratelimit_reset
if "message" not in context:
raw_message = str(error).strip()
if raw_message:
context["message"] = raw_message[:500]
if "reset_at" not in context:
message = context.get("message") or ""
if isinstance(message, str):
delay_match = re.search(r"quotaResetDelay[:\s\"]+(\\d+(?:\\.\\d+)?)(ms|s)", message, re.IGNORECASE)
if delay_match:
value = float(delay_match.group(1))
seconds = value / 1000.0 if delay_match.group(2).lower() == "ms" else value
context["reset_at"] = time.time() + seconds
else:
sec_match = re.search(
r"retry\s+(?:after\s+)?(\d+(?:\.\d+)?)\s*(?:sec|secs|seconds|s\b)",
message,
re.IGNORECASE,
)
if sec_match:
context["reset_at"] = time.time() + float(sec_match.group(1))
return context
def _usage_summary_for_api_request_hook(self, response: Any) -> Optional[Dict[str, Any]]:
"""Token buckets for ``post_api_request`` plugins (no raw ``response`` object)."""
if response is None:
return None
raw_usage = getattr(response, "usage", None)
if not raw_usage:
return None
from dataclasses import asdict
cu = normalize_usage(raw_usage, provider=self.provider, api_mode=self.api_mode)
summary = asdict(cu)
summary.pop("raw_usage", None)
summary["prompt_tokens"] = cu.prompt_tokens
summary["total_tokens"] = cu.total_tokens
return summary
def _dump_api_request_debug(
self,
api_kwargs: Dict[str, Any],
*,
reason: str,
error: Optional[Exception] = None,
) -> Optional[Path]:
"""Forwarder — see ``agent.agent_runtime_helpers.dump_api_request_debug``."""
from agent.agent_runtime_helpers import dump_api_request_debug
return dump_api_request_debug(self, api_kwargs, reason=reason, error=error)
@staticmethod
def _clean_session_content(content: str) -> str:
"""Convert REASONING_SCRATCHPAD to think tags and clean up whitespace."""
if not content:
return content
content = convert_scratchpad_to_think(content)
content = re.sub(r'\n+(<think>)', r'\n\1', content)
content = re.sub(r'(</think>)\n+', r'\1\n', content)
return content.strip()
def _save_session_log(self, messages: List[Dict[str, Any]] = None):
"""
Save the full raw session to a JSON file.
Stores every message exactly as the agent sees it: user messages,
assistant messages (with reasoning, finish_reason, tool_calls),
tool responses (with tool_call_id, tool_name), and injected system
messages (compression summaries, todo snapshots, etc.).
REASONING_SCRATCHPAD tags are converted to <think> blocks for consistency.
Overwritten after each turn so it always reflects the latest state.
"""
messages = messages or self._session_messages
if not messages:
return
try:
# Clean assistant content for session logs
cleaned = []
for msg in messages:
if msg.get("role") == "assistant" and msg.get("content"):
msg = dict(msg)
msg["content"] = self._clean_session_content(msg["content"])
cleaned.append(msg)
# Guard: never overwrite a larger session log with fewer messages.
# This protects against data loss when --resume loads a session whose
# messages weren't fully written to SQLite — the resumed agent starts
# with partial history and would otherwise clobber the full JSON log.
if self.session_log_file.exists():
try:
existing = json.loads(self.session_log_file.read_text(encoding="utf-8"))
existing_count = existing.get("message_count", len(existing.get("messages", [])))
if existing_count > len(cleaned):
logging.debug(
"Skipping session log overwrite: existing has %d messages, current has %d",
existing_count, len(cleaned),
)
return
except Exception:
pass # corrupted existing file — allow the overwrite
entry = {
"session_id": self.session_id,
"model": self.model,
"base_url": self.base_url,
"platform": self.platform,
"session_start": self.session_start.isoformat(),
"last_updated": datetime.now().isoformat(),
"system_prompt": self._cached_system_prompt or "",
"tools": self.tools or [],
"message_count": len(cleaned),
"messages": cleaned,
}
atomic_json_write(
self.session_log_file,
entry,
indent=2,
default=str,
)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to save session log: {e}")
def interrupt(self, message: str = None) -> None:
"""
Request the agent to interrupt its current tool-calling loop.
Call this from another thread (e.g., input handler, message receiver)
to gracefully stop the agent and process a new message.
Also signals long-running tool executions (e.g. terminal commands)
to terminate early, so the agent can respond immediately.
Args:
message: Optional new message that triggered the interrupt.
If provided, the agent will include this in its response context.
Example (CLI):
# In a separate input thread:
if user_typed_something:
agent.interrupt(user_input)
Example (Messaging):
# When new message arrives for active session:
if session_has_running_agent:
running_agent.interrupt(new_message.text)
"""
self._interrupt_requested = True
self._interrupt_message = message
# Signal all tools to abort any in-flight operations immediately.
# Scope the interrupt to this agent's execution thread so other
# agents running in the same process (gateway) are not affected.
if self._execution_thread_id is not None:
_set_interrupt(True, self._execution_thread_id)
self._interrupt_thread_signal_pending = False
else:
# The interrupt arrived before run_conversation() finished
# binding the agent to its execution thread. Defer the tool-level
# interrupt signal until startup completes instead of targeting
# the caller thread by mistake.
self._interrupt_thread_signal_pending = True
# Fan out to concurrent-tool worker threads. Those workers run tools
# on their own tids (ThreadPoolExecutor workers), so `is_interrupted()`
# inside a tool only sees an interrupt when their specific tid is in
# the `_interrupted_threads` set. Without this propagation, an
# already-running concurrent tool (e.g. a terminal command hung on
# network I/O) never notices the interrupt and has to run to its own
# timeout. See `_run_tool` for the matching entry/exit bookkeeping.
# `getattr` fallback covers test stubs that build AIAgent via
# object.__new__ and skip __init__.
_tracker = getattr(self, "_tool_worker_threads", None)
_tracker_lock = getattr(self, "_tool_worker_threads_lock", None)
if _tracker is not None and _tracker_lock is not None:
with _tracker_lock:
_worker_tids = list(_tracker)
for _wtid in _worker_tids:
try:
_set_interrupt(True, _wtid)
except Exception:
pass
# Propagate interrupt to any running child agents (subagent delegation)
with self._active_children_lock:
children_copy = list(self._active_children)
for child in children_copy:
try:
child.interrupt(message)
except Exception as e:
logger.debug("Failed to propagate interrupt to child agent: %s", e)
if not self.quiet_mode:
print("\n⚡ Interrupt requested" + (f": '{message[:40]}...'" if message and len(message) > 40 else f": '{message}'" if message else ""))
def clear_interrupt(self) -> None:
"""Clear any pending interrupt request and the per-thread tool interrupt signal."""
self._interrupt_requested = False
self._interrupt_message = None
self._interrupt_thread_signal_pending = False
if self._execution_thread_id is not None:
_set_interrupt(False, self._execution_thread_id)
# Also clear any concurrent-tool worker thread bits. Tracked
# workers normally clear their own bit on exit, but an explicit
# clear here guarantees no stale interrupt can survive a turn
# boundary and fire on a subsequent, unrelated tool call that
# happens to get scheduled onto the same recycled worker tid.
# `getattr` fallback covers test stubs that build AIAgent via
# object.__new__ and skip __init__.
_tracker = getattr(self, "_tool_worker_threads", None)
_tracker_lock = getattr(self, "_tool_worker_threads_lock", None)
if _tracker is not None and _tracker_lock is not None:
with _tracker_lock:
_worker_tids = list(_tracker)
for _wtid in _worker_tids:
try:
_set_interrupt(False, _wtid)
except Exception:
pass
# A hard interrupt supersedes any pending /steer — the steer was
# meant for the agent's next tool-call iteration, which will no
# longer happen. Drop it instead of surprising the user with a
# late injection on the post-interrupt turn.
_steer_lock = getattr(self, "_pending_steer_lock", None)
if _steer_lock is not None:
with _steer_lock:
self._pending_steer = None
def steer(self, text: str) -> bool:
"""
Inject a user message into the next tool result without interrupting.
Unlike interrupt(), this does NOT stop the current tool call. The
text is stashed and the agent loop appends it to the LAST tool
result's content once the current tool batch finishes. The model
sees the steer as part of the tool output on its next iteration.
Thread-safe: callable from gateway/CLI/TUI threads. Multiple calls
before the drain point concatenate with newlines.
Args:
text: The user text to inject. Empty strings are ignored.
Returns:
True if the steer was accepted, False if the text was empty.
"""
if not text or not text.strip():
return False
cleaned = text.strip()
_lock = getattr(self, "_pending_steer_lock", None)
if _lock is None:
# Test stubs that built AIAgent via object.__new__ skip __init__.
# Fall back to direct attribute set; no concurrent callers expected
# in those stubs.
existing = getattr(self, "_pending_steer", None)
self._pending_steer = (existing + "\n" + cleaned) if existing else cleaned
return True
with _lock:
if self._pending_steer:
self._pending_steer = self._pending_steer + "\n" + cleaned
else:
self._pending_steer = cleaned
return True
def _drain_pending_steer(self) -> Optional[str]:
"""Return the pending steer text (if any) and clear the slot.
Safe to call from the agent execution thread after appending tool
results. Returns None when no steer is pending.
"""
_lock = getattr(self, "_pending_steer_lock", None)
if _lock is None:
text = getattr(self, "_pending_steer", None)
self._pending_steer = None
return text
with _lock:
text = self._pending_steer
self._pending_steer = None
return text
def _record_file_mutation_result(
self,
tool_name: str,
args: Dict[str, Any],
result: Any,
is_error: bool,
) -> None:
"""Record a ``write_file`` / ``patch`` outcome for the turn-end verifier.
On failure, store ``{path: {error_preview, tool}}`` entries. On
success, remove any prior failure entries for the same paths (the
model recovered within the turn). Silently no-ops if the per-turn
state dict hasn't been initialised yet (e.g. a tool dispatched
outside ``run_conversation``).
"""
if tool_name not in _FILE_MUTATING_TOOLS:
return
state = getattr(self, "_turn_failed_file_mutations", None)
if state is None:
return
targets = _extract_file_mutation_targets(tool_name, args)
if not targets:
return
landed = file_mutation_result_landed(tool_name, result)
if is_error and not landed:
preview = _extract_error_preview(result)
for path in targets:
# Keep the FIRST error we saw for a given path unless we
# later see success. A repeated failure with a different
# message shouldn't silently overwrite the original.
if path not in state:
state[path] = {
"tool": tool_name,
"error_preview": preview,
}
else:
for path in targets:
state.pop(path, None)
def _file_mutation_verifier_enabled(self) -> bool:
"""Check whether the per-turn file-mutation verifier footer is on.
Config path: ``display.file_mutation_verifier`` (bool, default True).
``HERMES_FILE_MUTATION_VERIFIER`` env var overrides config. Exposed
as a method so tests can patch a single seam without reaching into
the private ``_turn_failed_file_mutations`` state dict.
"""
try:
import os as _os
env = _os.environ.get("HERMES_FILE_MUTATION_VERIFIER")
if env is not None:
return env.strip().lower() not in ("0", "false", "no", "off")
# Read from the persisted config.yaml so gateway and CLI share
# the same setting. Import lazily to avoid a startup-time cycle.
try:
from hermes_cli.config import load_config as _load_config
_cfg = _load_config() or {}
except Exception:
_cfg = {}
_display = _cfg.get("display") if isinstance(_cfg, dict) else None
if isinstance(_display, dict) and "file_mutation_verifier" in _display:
return bool(_display.get("file_mutation_verifier"))
except Exception:
pass
return True # safe default: verifier on
@staticmethod
def _format_file_mutation_failure_footer(failed: Dict[str, Dict[str, Any]]) -> str:
"""Render the per-turn failed-mutation dict as a user-facing footer.
Displays up to 10 paths with their first error preview, then a
count of any additional failures. Returns an empty string when
the dict is empty so callers can concatenate unconditionally.
"""
if not failed:
return ""
lines = [
"⚠️ File-mutation verifier: "
f"{len(failed)} file(s) were NOT modified this turn despite any "
"wording above that may suggest otherwise. Run `git status` or "
"`read_file` to confirm."
]
shown = 0
for path, info in failed.items():
if shown >= 10:
break
preview = (info.get("error_preview") or "").strip()
tool = info.get("tool") or "patch"
if preview:
lines.append(f"{path} — [{tool}] {preview}")
else:
lines.append(f"{path} — [{tool}] failed")
shown += 1
remaining = len(failed) - shown
if remaining > 0:
lines.append(f" • … and {remaining} more")
return "\n".join(lines)
def _apply_pending_steer_to_tool_results(self, messages: list, num_tool_msgs: int) -> None:
"""Append any pending /steer text to the last tool result in this turn.
Called at the end of a tool-call batch, before the next API call.
The steer is appended to the last ``role:"tool"`` message's content
with a clear marker so the model understands it came from the user
and NOT from the tool itself. Role alternation is preserved —
nothing new is inserted, we only modify existing content.
Args:
messages: The running messages list.
num_tool_msgs: Number of tool results appended in this batch;
used to locate the tail slice safely.
"""
if num_tool_msgs <= 0 or not messages:
return
steer_text = self._drain_pending_steer()
if not steer_text:
return
# Find the last tool-role message in the recent tail. Skipping
# non-tool messages defends against future code appending
# something else at the boundary.
target_idx = None
for j in range(len(messages) - 1, max(len(messages) - num_tool_msgs - 1, -1), -1):
msg = messages[j]
if isinstance(msg, dict) and msg.get("role") == "tool":
target_idx = j
break
if target_idx is None:
# No tool result in this batch (e.g. all skipped by interrupt);
# put the steer back so the caller's fallback path can deliver
# it as a normal next-turn user message.
_lock = getattr(self, "_pending_steer_lock", None)
if _lock is not None:
with _lock:
if self._pending_steer:
self._pending_steer = self._pending_steer + "\n" + steer_text
else:
self._pending_steer = steer_text
else:
existing = getattr(self, "_pending_steer", None)
self._pending_steer = (existing + "\n" + steer_text) if existing else steer_text
return
marker = f"\n\nUser guidance: {steer_text}"
existing_content = messages[target_idx].get("content", "")
if not isinstance(existing_content, str):
# Anthropic multimodal content blocks — preserve them and append
# a text block at the end.
try:
blocks = list(existing_content) if existing_content else []
blocks.append({"type": "text", "text": marker.lstrip()})
messages[target_idx]["content"] = blocks
except Exception:
# Fall back to string replacement if content shape is unexpected.
messages[target_idx]["content"] = f"{existing_content}{marker}"
else:
messages[target_idx]["content"] = existing_content + marker
logger.info(
"Delivered /steer to agent after tool batch (%d chars): %s",
len(steer_text),
steer_text[:120] + ("..." if len(steer_text) > 120 else ""),
)
def _touch_activity(self, desc: str) -> None:
"""Update the last-activity timestamp and description (thread-safe)."""
self._last_activity_ts = time.time()
self._last_activity_desc = desc
def _capture_rate_limits(self, http_response: Any) -> None:
"""Parse x-ratelimit-* headers from an HTTP response and cache the state.
Called after each streaming API call. The httpx Response object is
available on the OpenAI SDK Stream via ``stream.response``.
"""
if http_response is None:
return
headers = getattr(http_response, "headers", None)
if not headers:
return
try:
from agent.rate_limit_tracker import parse_rate_limit_headers
state = parse_rate_limit_headers(headers, provider=self.provider)
if state is not None:
self._rate_limit_state = state
except Exception:
pass # Never let header parsing break the agent loop
def get_rate_limit_state(self):
"""Return the last captured RateLimitState, or None."""
return self._rate_limit_state
def _check_openrouter_cache_status(self, http_response: Any) -> None:
"""Read X-OpenRouter-Cache-Status from response headers and log it.
Increments ``_or_cache_hits`` on HIT so callers can report savings.
"""
if http_response is None:
return
headers = getattr(http_response, "headers", None)
if not headers:
return
try:
status = headers.get("x-openrouter-cache-status")
if not status:
return
if status.upper() == "HIT":
self._or_cache_hits += 1
logger.info("OpenRouter response cache HIT (total: %d)", self._or_cache_hits)
else:
logger.debug("OpenRouter response cache %s", status.upper())
except Exception:
pass # Never let header parsing break the agent loop
def get_activity_summary(self) -> dict:
"""Return a snapshot of the agent's current activity for diagnostics.
Called by the gateway timeout handler to report what the agent was doing
when it was killed, and by the periodic "still working" notifications.
"""
elapsed = time.time() - self._last_activity_ts
return {
"last_activity_ts": self._last_activity_ts,
"last_activity_desc": self._last_activity_desc,
"seconds_since_activity": round(elapsed, 1),
"current_tool": self._current_tool,
"api_call_count": self._api_call_count,
"max_iterations": self.max_iterations,
"budget_used": self.iteration_budget.used,
"budget_max": self.iteration_budget.max_total,
}
def shutdown_memory_provider(self, messages: list = None) -> None:
"""Shut down the memory provider and context engine — call at actual session boundaries.
This calls on_session_end() then shutdown_all() on the memory
manager, and on_session_end() on the context engine.
NOT called per-turn — only at CLI exit, /reset, gateway
session expiry, etc.
"""
if self._memory_manager:
try:
self._memory_manager.on_session_end(messages or [])
except Exception:
pass
try:
self._memory_manager.shutdown_all()
except Exception:
pass
# Notify context engine of session end (flush DAG, close DBs, etc.)
if hasattr(self, "context_compressor") and self.context_compressor:
try:
self.context_compressor.on_session_end(
self.session_id or "",
messages or [],
)
except Exception:
pass
def commit_memory_session(self, messages: list = None) -> None:
"""Trigger end-of-session extraction without tearing providers down.
Called when session_id rotates (e.g. /new, context compression);
providers keep their state and continue running under the old
session_id — they just flush pending extraction now."""
if self._memory_manager:
try:
self._memory_manager.on_session_end(messages or [])
except Exception:
pass
# Notify context engine of session end too — same lifecycle moment as
# the memory manager's on_session_end. Without this, engines that
# accumulate per-session state (DAGs, summaries) leak that state from
# the rotated-out session into whatever comes next under the same
# compressor instance. Mirrors the call in shutdown_memory_provider().
# See issue #22394.
if hasattr(self, "context_compressor") and self.context_compressor:
try:
self.context_compressor.on_session_end(
self.session_id or "",
messages or [],
)
except Exception:
pass
def _sync_external_memory_for_turn(
self,
*,
original_user_message: Any,
final_response: Any,
interrupted: bool,
) -> None:
"""Mirror a completed turn into external memory providers.
Called at the end of ``run_conversation`` with the cleaned user
message (``original_user_message``) and the finalised assistant
response. The external memory backend gets both ``sync_all`` (to
persist the exchange) and ``queue_prefetch_all`` (to start
warming context for the next turn) in one shot.
Uses ``original_user_message`` rather than ``user_message``
because the latter may carry injected skill content that bloats
or breaks provider queries.
Interrupted turns are skipped entirely (#15218). A partial
assistant output, an aborted tool chain, or a mid-stream reset
is not durable conversational truth — mirroring it into an
external memory backend pollutes future recall with state the
user never saw completed. The prefetch is gated on the same
flag: the user's next message is almost certainly a retry of
the same intent, and a prefetch keyed on the interrupted turn
would fire against stale context.
Normal completed turns still sync as before. The whole body is
wrapped in ``try/except Exception`` because external memory
providers are strictly best-effort — a misconfigured or offline
backend must not block the user from seeing their response.
"""
if interrupted:
return
if not (self._memory_manager and final_response and original_user_message):
return
try:
self._memory_manager.sync_all(
original_user_message, final_response,
session_id=self.session_id or "",
)
self._memory_manager.queue_prefetch_all(
original_user_message,
session_id=self.session_id or "",
)
except Exception:
pass
def release_clients(self) -> None:
"""Release LLM client resources WITHOUT tearing down session tool state.
Used by the gateway when evicting this agent from _agent_cache for
memory-management reasons (LRU cap or idle TTL) — the session may
resume at any time with a freshly-built AIAgent that reuses the
same task_id / session_id, so we must NOT kill:
- process_registry entries for task_id (user's bg shells)
- terminal sandbox for task_id (cwd, env, shell state)
- browser daemon for task_id (open tabs, cookies)
- memory provider (has its own lifecycle; keeps running)
We DO close:
- OpenAI/httpx client pool (big chunk of held memory + sockets;
the rebuilt agent gets a fresh client anyway)
- Active child subagents (per-turn artefacts; safe to drop)
Safe to call multiple times. Distinct from close() — which is the
hard teardown for actual session boundaries (/new, /reset, session
expiry).
"""
# Close active child agents (per-turn; no cross-turn persistence).
try:
with self._active_children_lock:
children = list(self._active_children)
self._active_children.clear()
for child in children:
try:
child.release_clients()
except Exception:
# Fall back to full close on children; they're per-turn.
try:
child.close()
except Exception:
pass
except Exception:
pass
# Close the OpenAI/httpx client to release sockets immediately.
try:
client = getattr(self, "client", None)
if client is not None:
self._close_openai_client(client, reason="cache_evict", shared=True)
self.client = None
except Exception:
pass
def close(self) -> None:
"""Release all resources held by this agent instance.
Cleans up subprocess resources that would otherwise become orphans:
- Background processes tracked in ProcessRegistry
- Terminal sandbox environments
- Browser daemon sessions
- Active child agents (subagent delegation)
- OpenAI/httpx client connections
Safe to call multiple times (idempotent). Each cleanup step is
independently guarded so a failure in one does not prevent the rest.
"""
task_id = getattr(self, "session_id", None) or ""
# 1. Kill background processes for this task
try:
from tools.process_registry import process_registry
process_registry.kill_all(task_id=task_id)
except Exception:
pass
# 2. Clean terminal sandbox environments
try:
cleanup_vm(task_id)
except Exception:
pass
# 3. Clean browser daemon sessions
try:
cleanup_browser(task_id)
except Exception:
pass
# 4. Close active child agents
try:
with self._active_children_lock:
children = list(self._active_children)
self._active_children.clear()
for child in children:
try:
child.close()
except Exception:
pass
except Exception:
pass
# 5. Close the OpenAI/httpx client
try:
client = getattr(self, "client", None)
if client is not None:
self._close_openai_client(client, reason="agent_close", shared=True)
self.client = None
except Exception:
pass
def _hydrate_todo_store(self, history: List[Dict[str, Any]]) -> None:
"""
Recover todo state from conversation history.
The gateway creates a fresh AIAgent per message, so the in-memory
TodoStore is empty. We scan the history for the most recent todo
tool response and replay it to reconstruct the state.
"""
# Walk history backwards to find the most recent todo tool response
last_todo_response = None
for msg in reversed(history):
if msg.get("role") != "tool":
continue
content = msg.get("content", "")
# Quick check: todo responses contain "todos" key
if '"todos"' not in content:
continue
try:
data = json.loads(content)
if "todos" in data and isinstance(data["todos"], list):
last_todo_response = data["todos"]
break
except (json.JSONDecodeError, TypeError):
continue
if last_todo_response:
# Replay the items into the store (replace mode)
self._todo_store.write(last_todo_response, merge=False)
if not self.quiet_mode:
self._vprint(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history")
_set_interrupt(False)
@property
def is_interrupted(self) -> bool:
"""Check if an interrupt has been requested."""
return self._interrupt_requested
def _build_system_prompt_parts(self, system_message: str = None) -> Dict[str, str]:
"""Forwarder — see ``agent.system_prompt.build_system_prompt_parts``."""
from agent.system_prompt import build_system_prompt_parts
return build_system_prompt_parts(self, system_message=system_message)
def _build_system_prompt(self, system_message: str = None) -> str:
"""Forwarder — see ``agent.system_prompt.build_system_prompt``."""
from agent.system_prompt import build_system_prompt
return build_system_prompt(self, system_message=system_message)
@staticmethod
def _get_tool_call_id_static(tc) -> str:
"""Extract call ID from a tool_call entry (dict or object)."""
if isinstance(tc, dict):
return tc.get("call_id", "") or tc.get("id", "") or ""
return getattr(tc, "call_id", "") or getattr(tc, "id", "") or ""
@staticmethod
def _get_tool_call_name_static(tc) -> str:
"""Extract function name from a tool_call entry (dict or object).
Gemini's OpenAI-compatibility endpoint requires every `role: tool`
message to carry the matching function name. OpenAI/Anthropic/ollama
tolerate its absence, so the field is best-effort: callers fall back
to "" and the message still works elsewhere.
"""
if isinstance(tc, dict):
fn = tc.get("function")
if isinstance(fn, dict):
return fn.get("name", "") or ""
return ""
fn = getattr(tc, "function", None)
return getattr(fn, "name", "") or ""
_VALID_API_ROLES = frozenset({"system", "user", "assistant", "tool", "function", "developer"})
@staticmethod
def _sanitize_api_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Fix orphaned tool_call / tool_result pairs before every LLM call.
Runs unconditionally — not gated on whether the context compressor
is present — so orphans from session loading or manual message
manipulation are always caught.
"""
# --- Role allowlist: drop messages with roles the API won't accept ---
filtered = []
for msg in messages:
role = msg.get("role")
if role not in AIAgent._VALID_API_ROLES:
logger.debug(
"Pre-call sanitizer: dropping message with invalid role %r",
role,
)
continue
filtered.append(msg)
messages = filtered
surviving_call_ids: set = set()
for msg in messages:
if msg.get("role") == "assistant":
for tc in msg.get("tool_calls") or []:
cid = AIAgent._get_tool_call_id_static(tc)
if cid:
surviving_call_ids.add(cid)
result_call_ids: set = set()
for msg in messages:
if msg.get("role") == "tool":
cid = msg.get("tool_call_id")
if cid:
result_call_ids.add(cid)
# 1. Drop tool results with no matching assistant call
orphaned_results = result_call_ids - surviving_call_ids
if orphaned_results:
messages = [
m for m in messages
if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results)
]
logger.debug(
"Pre-call sanitizer: removed %d orphaned tool result(s)",
len(orphaned_results),
)
# 2. Inject stub results for calls whose result was dropped
missing_results = surviving_call_ids - result_call_ids
if missing_results:
patched: List[Dict[str, Any]] = []
for msg in messages:
patched.append(msg)
if msg.get("role") == "assistant":
for tc in msg.get("tool_calls") or []:
cid = AIAgent._get_tool_call_id_static(tc)
if cid in missing_results:
patched.append({
"role": "tool",
"name": AIAgent._get_tool_call_name_static(tc),
"content": "[Result unavailable — see context summary above]",
"tool_call_id": cid,
})
messages = patched
logger.debug(
"Pre-call sanitizer: added %d stub tool result(s)",
len(missing_results),
)
return messages
@staticmethod
def _is_thinking_only_assistant(msg: Dict[str, Any]) -> bool:
"""Return True if ``msg`` is an assistant turn whose only payload is reasoning.
"Thinking-only" means the model emitted reasoning (``reasoning`` or
``reasoning_content``) but no visible text and no tool_calls. When sent
back to providers that convert reasoning into thinking blocks (native
Anthropic, OpenRouter Anthropic, third-party Anthropic-compatible
gateways), the resulting message has only thinking blocks — which
Anthropic rejects with HTTP 400 "The final block in an assistant
message cannot be `thinking`."
Symmetric with Claude Code's ``filterOrphanedThinkingOnlyMessages``
(src/utils/messages.ts). We drop the whole turn from the API copy
rather than fabricating stub text — the message log (UI transcript)
keeps the reasoning block; only the wire copy is cleaned.
"""
if not isinstance(msg, dict) or msg.get("role") != "assistant":
return False
if msg.get("tool_calls"):
return False
# Does it have any actual output?
content = msg.get("content")
if isinstance(content, str):
if content.strip():
return False
elif isinstance(content, list):
for block in content:
if not isinstance(block, dict):
if block: # non-empty non-dict string etc.
return False
continue
btype = block.get("type")
if btype in {"thinking", "redacted_thinking"}:
continue
if btype == "text":
text = block.get("text", "")
if isinstance(text, str) and text.strip():
return False
continue
# tool_use, image, document, etc. — real payload
return False
elif content is not None and content != "":
return False
# Content is empty-ish. Is there reasoning to make it thinking-only?
reasoning = msg.get("reasoning_content") or msg.get("reasoning")
if isinstance(reasoning, str) and reasoning.strip():
return True
# reasoning_details list form
rd = msg.get("reasoning_details")
if isinstance(rd, list) and rd:
return True
return False
@staticmethod
def _drop_thinking_only_and_merge_users(
messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Forwarder — see ``agent.agent_runtime_helpers.drop_thinking_only_and_merge_users``."""
from agent.agent_runtime_helpers import drop_thinking_only_and_merge_users
return drop_thinking_only_and_merge_users(messages)
@staticmethod
def _cap_delegate_task_calls(tool_calls: list) -> list:
"""Truncate excess delegate_task calls to max_concurrent_children.
The delegate_tool caps the task list inside a single call, but the
model can emit multiple separate delegate_task tool_calls in one
turn. This truncates the excess, preserving all non-delegate calls.
Returns the original list if no truncation was needed.
"""
from tools.delegate_tool import _get_max_concurrent_children
max_children = _get_max_concurrent_children()
delegate_count = sum(1 for tc in tool_calls if tc.function.name == "delegate_task")
if delegate_count <= max_children:
return tool_calls
kept_delegates = 0
truncated = []
for tc in tool_calls:
if tc.function.name == "delegate_task":
if kept_delegates < max_children:
truncated.append(tc)
kept_delegates += 1
else:
truncated.append(tc)
logger.warning(
"Truncated %d excess delegate_task call(s) to enforce "
"max_concurrent_children=%d limit",
delegate_count - max_children, max_children,
)
return truncated
@staticmethod
def _deduplicate_tool_calls(tool_calls: list) -> list:
"""Remove duplicate (tool_name, arguments) pairs within a single turn.
Only the first occurrence of each unique pair is kept.
Returns the original list if no duplicates were found.
"""
seen: set = set()
unique: list = []
for tc in tool_calls:
key = (tc.function.name, tc.function.arguments)
if key not in seen:
seen.add(key)
unique.append(tc)
else:
logger.warning("Removed duplicate tool call: %s", tc.function.name)
return unique if len(unique) < len(tool_calls) else tool_calls
def _repair_tool_call(self, tool_name: str) -> str | None:
"""Attempt to repair a mismatched tool name before aborting.
Models sometimes emit variants of a tool name that differ only
in casing, separators, or class-like suffixes. Normalize
aggressively before falling back to fuzzy match:
1. Lowercase direct match.
2. Lowercase + hyphens/spaces -> underscores.
3. CamelCase -> snake_case (TodoTool -> todo_tool).
4. Strip trailing ``_tool`` / ``-tool`` / ``tool`` suffix that
Claude-style models sometimes tack on (TodoTool_tool ->
TodoTool -> Todo -> todo). Applied twice so double-tacked
suffixes like ``TodoTool_tool`` reduce all the way.
5. Fuzzy match (difflib, cutoff=0.7).
See #14784 for the original reports (TodoTool_tool, Patch_tool,
BrowserClick_tool were all returning "Unknown tool" before).
Returns the repaired name if found in valid_tool_names, else None.
"""
import re
from difflib import get_close_matches
if not tool_name:
return None
def _norm(s: str) -> str:
return s.lower().replace("-", "_").replace(" ", "_")
def _camel_snake(s: str) -> str:
return re.sub(r"(?<!^)(?=[A-Z])", "_", s).lower()
def _strip_tool_suffix(s: str) -> str | None:
lc = s.lower()
for suffix in ("_tool", "-tool", "tool"):
if lc.endswith(suffix):
return s[: -len(suffix)].rstrip("_-")
return None
# Cheap fast-paths first — these cover the common case.
lowered = tool_name.lower()
if lowered in self.valid_tool_names:
return lowered
normalized = _norm(tool_name)
if normalized in self.valid_tool_names:
return normalized
# Build the full candidate set for class-like emissions.
cands: set[str] = {tool_name, lowered, normalized, _camel_snake(tool_name)}
# Strip trailing tool-suffix up to twice — TodoTool_tool needs it.
for _ in range(2):
extra: set[str] = set()
for c in cands:
stripped = _strip_tool_suffix(c)
if stripped:
extra.add(stripped)
extra.add(_norm(stripped))
extra.add(_camel_snake(stripped))
cands |= extra
for c in cands:
if c and c in self.valid_tool_names:
return c
# Fuzzy match as last resort.
matches = get_close_matches(lowered, self.valid_tool_names, n=1, cutoff=0.7)
if matches:
return matches[0]
return None
def _invalidate_system_prompt(self):
"""Forwarder — see ``agent.system_prompt.invalidate_system_prompt``."""
from agent.system_prompt import invalidate_system_prompt
invalidate_system_prompt(self)
@staticmethod
def _deterministic_call_id(fn_name: str, arguments: str, index: int = 0) -> str:
"""Generate a deterministic call_id from tool call content.
Used as a fallback when the API doesn't provide a call_id.
Deterministic IDs prevent cache invalidation — random UUIDs would
make every API call's prefix unique, breaking OpenAI's prompt cache.
"""
return _codex_deterministic_call_id(fn_name, arguments, index)
@staticmethod
def _split_responses_tool_id(raw_id: Any) -> tuple[Optional[str], Optional[str]]:
"""Split a stored tool id into (call_id, response_item_id)."""
return _codex_split_responses_tool_id(raw_id)
def _derive_responses_function_call_id(
self,
call_id: str,
response_item_id: Optional[str] = None,
) -> str:
"""Build a valid Responses `function_call.id` (must start with `fc_`)."""
return _codex_derive_responses_function_call_id(call_id, response_item_id)
def _thread_identity(self) -> str:
thread = threading.current_thread()
return f"{thread.name}:{thread.ident}"
def _client_log_context(self) -> str:
provider = getattr(self, "provider", "unknown")
base_url = getattr(self, "base_url", "unknown")
model = getattr(self, "model", "unknown")
return (
f"thread={self._thread_identity()} provider={provider} "
f"base_url={base_url} model={model}"
)
def _openai_client_lock(self) -> threading.RLock:
lock = getattr(self, "_client_lock", None)
if lock is None:
lock = threading.RLock()
self._client_lock = lock
return lock
@staticmethod
def _is_openai_client_closed(client: Any) -> bool:
"""Check if an OpenAI client is closed.
Handles both property and method forms of is_closed:
- httpx.Client.is_closed is a bool property
- openai.OpenAI.is_closed is a method returning bool
Prior bug: getattr(client, "is_closed", False) returned the bound method,
which is always truthy, causing unnecessary client recreation on every call.
"""
from unittest.mock import Mock
if isinstance(client, Mock):
return False
is_closed_attr = getattr(client, "is_closed", None)
if is_closed_attr is not None:
# Handle method (openai SDK) vs property (httpx)
if callable(is_closed_attr):
if is_closed_attr():
return True
elif bool(is_closed_attr):
return True
http_client = getattr(client, "_client", None)
if http_client is not None:
return bool(getattr(http_client, "is_closed", False))
return False
@staticmethod
def _build_keepalive_http_client(base_url: str = "") -> Any:
try:
import httpx as _httpx
import socket as _socket
_sock_opts = [(_socket.SOL_SOCKET, _socket.SO_KEEPALIVE, 1)]
if hasattr(_socket, "TCP_KEEPIDLE"):
_sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPIDLE, 30))
_sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPINTVL, 10))
_sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPCNT, 3))
elif hasattr(_socket, "TCP_KEEPALIVE"):
_sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPALIVE, 30))
# When a custom transport is provided, httpx won't auto-read proxy
# from env vars (allow_env_proxies = trust_env and transport is None).
# Explicitly read proxy settings while still honoring NO_PROXY for
# loopback / local endpoints such as a locally hosted sub2api.
_proxy = _get_proxy_for_base_url(base_url)
return _httpx.Client(
transport=_httpx.HTTPTransport(socket_options=_sock_opts),
proxy=_proxy,
)
except Exception:
return None
def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
"""Forwarder — see ``agent.agent_runtime_helpers.create_openai_client``."""
from agent.agent_runtime_helpers import create_openai_client
return create_openai_client(self, client_kwargs, reason=reason, shared=shared)
@staticmethod
def _force_close_tcp_sockets(client: Any) -> int:
"""Force-close underlying TCP sockets to prevent CLOSE-WAIT accumulation.
When a provider drops a connection mid-stream, httpx's ``client.close()``
performs a graceful shutdown which leaves sockets in CLOSE-WAIT until the
OS times them out (often minutes). This method walks the httpx transport
pool and issues ``socket.shutdown(SHUT_RDWR)`` + ``socket.close()`` to
force an immediate TCP RST, freeing the file descriptors.
Returns the number of sockets force-closed.
"""
import socket as _socket
closed = 0
try:
http_client = getattr(client, "_client", None)
if http_client is None:
return 0
transport = getattr(http_client, "_transport", None)
if transport is None:
return 0
pool = getattr(transport, "_pool", None)
if pool is None:
return 0
# httpx uses httpcore connection pools; connections live in
# _connections (list) or _pool (list) depending on version.
connections = (
getattr(pool, "_connections", None)
or getattr(pool, "_pool", None)
or []
)
for conn in list(connections):
stream = (
getattr(conn, "_network_stream", None)
or getattr(conn, "_stream", None)
)
if stream is None:
continue
sock = getattr(stream, "_sock", None)
if sock is None:
sock = getattr(stream, "stream", None)
if sock is not None:
sock = getattr(sock, "_sock", None)
if sock is None:
continue
try:
sock.shutdown(_socket.SHUT_RDWR)
except OSError:
pass
try:
sock.close()
except OSError:
pass
closed += 1
except Exception as exc:
logger.debug("Force-close TCP sockets sweep error: %s", exc)
return closed
def _close_openai_client(self, client: Any, *, reason: str, shared: bool) -> None:
if client is None:
return
# Force-close TCP sockets first to prevent CLOSE-WAIT accumulation,
# then do the graceful SDK-level close.
force_closed = self._force_close_tcp_sockets(client)
try:
client.close()
logger.info(
"OpenAI client closed (%s, shared=%s, tcp_force_closed=%d) %s",
reason,
shared,
force_closed,
self._client_log_context(),
)
except Exception as exc:
logger.debug(
"OpenAI client close failed (%s, shared=%s) %s error=%s",
reason,
shared,
self._client_log_context(),
exc,
)
def _replace_primary_openai_client(self, *, reason: str) -> bool:
with self._openai_client_lock():
old_client = getattr(self, "client", None)
try:
new_client = self._create_openai_client(self._client_kwargs, reason=reason, shared=True)
except Exception as exc:
logger.warning(
"Failed to rebuild shared OpenAI client (%s) %s error=%s",
reason,
self._client_log_context(),
exc,
)
return False
self.client = new_client
self._close_openai_client(old_client, reason=f"replace:{reason}", shared=True)
return True
def _ensure_primary_openai_client(self, *, reason: str) -> Any:
with self._openai_client_lock():
client = getattr(self, "client", None)
if client is not None and not self._is_openai_client_closed(client):
return client
logger.warning(
"Detected closed shared OpenAI client; recreating before use (%s) %s",
reason,
self._client_log_context(),
)
if not self._replace_primary_openai_client(reason=f"recreate_closed:{reason}"):
raise RuntimeError("Failed to recreate closed OpenAI client")
with self._openai_client_lock():
return self.client
def _cleanup_dead_connections(self) -> bool:
"""Detect and clean up dead TCP connections on the primary client.
Inspects the httpx connection pool for sockets in unhealthy states
(CLOSE-WAIT, errors). If any are found, force-closes all sockets
and rebuilds the primary client from scratch.
Returns True if dead connections were found and cleaned up.
"""
client = getattr(self, "client", None)
if client is None:
return False
try:
http_client = getattr(client, "_client", None)
if http_client is None:
return False
transport = getattr(http_client, "_transport", None)
if transport is None:
return False
pool = getattr(transport, "_pool", None)
if pool is None:
return False
connections = (
getattr(pool, "_connections", None)
or getattr(pool, "_pool", None)
or []
)
dead_count = 0
for conn in list(connections):
# Check for connections that are idle but have closed sockets
stream = (
getattr(conn, "_network_stream", None)
or getattr(conn, "_stream", None)
)
if stream is None:
continue
sock = getattr(stream, "_sock", None)
if sock is None:
sock = getattr(stream, "stream", None)
if sock is not None:
sock = getattr(sock, "_sock", None)
if sock is None:
continue
# Probe socket health with a non-blocking recv peek
import socket as _socket
try:
sock.setblocking(False)
data = sock.recv(1, _socket.MSG_PEEK | _socket.MSG_DONTWAIT)
if data == b"":
dead_count += 1
except BlockingIOError:
pass # No data available — socket is healthy
except OSError:
dead_count += 1
finally:
try:
sock.setblocking(True)
except OSError:
pass
if dead_count > 0:
logger.warning(
"Found %d dead connection(s) in client pool — rebuilding client",
dead_count,
)
self._replace_primary_openai_client(reason="dead_connection_cleanup")
return True
except Exception as exc:
logger.debug("Dead connection check error: %s", exc)
return False
@staticmethod
def _api_kwargs_have_image_parts(api_kwargs: dict) -> bool:
"""Return True when the outbound request still contains native image parts."""
if not isinstance(api_kwargs, dict):
return False
candidates = []
messages = api_kwargs.get("messages")
if isinstance(messages, list):
candidates.extend(messages)
# Responses API payloads use `input`; after conversion, image parts can
# still be present there instead of in `messages`.
response_input = api_kwargs.get("input")
if isinstance(response_input, list):
candidates.extend(response_input)
def _contains_image(value: Any) -> bool:
if isinstance(value, dict):
ptype = value.get("type")
if ptype in {"image_url", "input_image"}:
return True
return any(_contains_image(v) for v in value.values())
if isinstance(value, list):
return any(_contains_image(v) for v in value)
return False
return any(_contains_image(item) for item in candidates)
def _copilot_headers_for_request(self, *, is_vision: bool) -> dict:
from hermes_cli.copilot_auth import copilot_request_headers
return copilot_request_headers(is_agent_turn=True, is_vision=is_vision)
def _create_request_openai_client(self, *, reason: str, api_kwargs: Optional[dict] = None) -> Any:
from unittest.mock import Mock
primary_client = self._ensure_primary_openai_client(reason=reason)
if isinstance(primary_client, Mock):
return primary_client
with self._openai_client_lock():
request_kwargs = dict(self._client_kwargs)
# Per-request OpenAI-wire clients (used by both the non-streaming
# chat-completions path and the streaming chat-completions path
# in `_interruptible_api_call`) should not run the SDK's built-in
# retry loop: the agent's outer loop owns retries with credential
# rotation, provider fallback, and backoff that the SDK can't
# see. Leaving SDK retries on (default 2) compounds with our outer
# retries and lets a single hung provider request stretch to ~3x
# the per-call timeout before our stale detector reports it.
# Shared/primary clients and Anthropic / Bedrock paths are
# unaffected (they don't go through here).
request_kwargs["max_retries"] = 0
if (
base_url_host_matches(str(request_kwargs.get("base_url", "")), "api.githubcopilot.com")
and self._api_kwargs_have_image_parts(api_kwargs or {})
):
request_kwargs["default_headers"] = self._copilot_headers_for_request(is_vision=True)
return self._create_openai_client(request_kwargs, reason=reason, shared=False)
def _close_request_openai_client(self, client: Any, *, reason: str) -> None:
self._close_openai_client(client, reason=reason, shared=False)
def _run_codex_stream(self, api_kwargs: dict, client: Any = None, on_first_delta: callable = None):
"""Forwarder — see ``agent.codex_runtime.run_codex_stream``."""
from agent.codex_runtime import run_codex_stream
return run_codex_stream(self, api_kwargs, client, on_first_delta)
def _run_codex_create_stream_fallback(self, api_kwargs: dict, client: Any = None):
"""Forwarder — see ``agent.codex_runtime.run_codex_create_stream_fallback``."""
from agent.codex_runtime import run_codex_create_stream_fallback
return run_codex_create_stream_fallback(self, api_kwargs, client)
def _try_refresh_codex_client_credentials(self, *, force: bool = True) -> bool:
if self.api_mode != "codex_responses" or self.provider != "openai-codex":
return False
try:
from hermes_cli.auth import resolve_codex_runtime_credentials
creds = resolve_codex_runtime_credentials(force_refresh=force)
except Exception as exc:
logger.debug("Codex credential refresh failed: %s", exc)
return False
api_key = creds.get("api_key")
base_url = creds.get("base_url")
if not isinstance(api_key, str) or not api_key.strip():
return False
if not isinstance(base_url, str) or not base_url.strip():
return False
self.api_key = api_key.strip()
self.base_url = base_url.strip().rstrip("/")
self._client_kwargs["api_key"] = self.api_key
self._client_kwargs["base_url"] = self.base_url
if not self._replace_primary_openai_client(reason="codex_credential_refresh"):
return False
return True
def _try_refresh_nous_client_credentials(self, *, force: bool = True) -> bool:
if self.api_mode != "chat_completions" or self.provider != "nous":
return False
try:
from hermes_cli.auth import resolve_nous_runtime_credentials
creds = resolve_nous_runtime_credentials(
min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
force_mint=force,
)
except Exception as exc:
logger.debug("Nous credential refresh failed: %s", exc)
return False
api_key = creds.get("api_key")
base_url = creds.get("base_url")
if not isinstance(api_key, str) or not api_key.strip():
return False
if not isinstance(base_url, str) or not base_url.strip():
return False
self.api_key = api_key.strip()
self.base_url = base_url.strip().rstrip("/")
self._client_kwargs["api_key"] = self.api_key
self._client_kwargs["base_url"] = self.base_url
# Nous requests should not inherit OpenRouter-only attribution headers.
self._client_kwargs.pop("default_headers", None)
if not self._replace_primary_openai_client(reason="nous_credential_refresh"):
return False
return True
def _try_refresh_copilot_client_credentials(self) -> bool:
"""Refresh Copilot credentials and rebuild the shared OpenAI client.
Copilot tokens may remain the same string across refreshes (`gh auth token`
returns a stable OAuth token in many setups). We still rebuild the client
on 401 so retries recover from stale auth/client state without requiring
a session restart.
"""
if self.provider != "copilot":
return False
try:
from hermes_cli.copilot_auth import resolve_copilot_token
new_token, token_source = resolve_copilot_token()
except Exception as exc:
logger.debug("Copilot credential refresh failed: %s", exc)
return False
if not isinstance(new_token, str) or not new_token.strip():
return False
new_token = new_token.strip()
self.api_key = new_token
self._client_kwargs["api_key"] = self.api_key
self._client_kwargs["base_url"] = self.base_url
self._apply_client_headers_for_base_url(str(self.base_url or ""))
if not self._replace_primary_openai_client(reason="copilot_credential_refresh"):
return False
logger.info("Copilot credentials refreshed from %s", token_source)
return True
def _try_refresh_anthropic_client_credentials(self) -> bool:
if self.api_mode != "anthropic_messages" or not hasattr(self, "_anthropic_api_key"):
return False
# Only refresh credentials for the native Anthropic provider.
# Other anthropic_messages providers (MiniMax, Alibaba, etc.) use their own keys.
if self.provider != "anthropic":
return False
# Azure endpoints use static API keys — OAuth token rotation doesn't apply.
# Refreshing would pick up ~/.claude/.credentials.json OAuth token and break auth.
_base = getattr(self, "_anthropic_base_url", "") or ""
if "azure.com" in _base:
return False
try:
from agent.anthropic_adapter import resolve_anthropic_token, build_anthropic_client
new_token = resolve_anthropic_token()
except Exception as exc:
logger.debug("Anthropic credential refresh failed: %s", exc)
return False
if not isinstance(new_token, str) or not new_token.strip():
return False
new_token = new_token.strip()
if new_token == self._anthropic_api_key:
return False
try:
self._anthropic_client.close()
except Exception:
pass
try:
self._anthropic_client = build_anthropic_client(
new_token,
getattr(self, "_anthropic_base_url", None),
timeout=get_provider_request_timeout(self.provider, self.model),
)
except Exception as exc:
logger.warning("Failed to rebuild Anthropic client after credential refresh: %s", exc)
return False
self._anthropic_api_key = new_token
# Update OAuth flag — token type may have changed (API key ↔ OAuth).
# Only treat as OAuth on native Anthropic; third-party endpoints using
# the Anthropic protocol must not trip OAuth paths (#1739 & third-party
# identity-injection guard).
from agent.anthropic_adapter import _is_oauth_token
self._is_anthropic_oauth = _is_oauth_token(new_token) if self.provider == "anthropic" else False
return True
def _apply_client_headers_for_base_url(self, base_url: str) -> None:
from agent.auxiliary_client import _AI_GATEWAY_HEADERS, build_or_headers
if base_url_host_matches(base_url, "openrouter.ai"):
self._client_kwargs["default_headers"] = build_or_headers()
elif base_url_host_matches(base_url, "ai-gateway.vercel.sh"):
self._client_kwargs["default_headers"] = dict(_AI_GATEWAY_HEADERS)
elif base_url_host_matches(base_url, "api.routermint.com"):
self._client_kwargs["default_headers"] = _routermint_headers()
elif base_url_host_matches(base_url, "api.githubcopilot.com"):
from hermes_cli.models import copilot_default_headers
self._client_kwargs["default_headers"] = copilot_default_headers()
elif base_url_host_matches(base_url, "api.kimi.com"):
self._client_kwargs["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
elif base_url_host_matches(base_url, "portal.qwen.ai"):
self._client_kwargs["default_headers"] = _qwen_portal_headers()
elif base_url_host_matches(base_url, "chatgpt.com"):
from agent.auxiliary_client import _codex_cloudflare_headers
self._client_kwargs["default_headers"] = _codex_cloudflare_headers(
self._client_kwargs.get("api_key", "")
)
else:
# No URL-specific headers — check profile.default_headers before clearing.
_ph_headers = None
try:
from providers import get_provider_profile as _gpf2
_ph2 = _gpf2(self.provider)
if _ph2 and _ph2.default_headers:
_ph_headers = dict(_ph2.default_headers)
except Exception:
pass
if _ph_headers:
self._client_kwargs["default_headers"] = _ph_headers
else:
self._client_kwargs.pop("default_headers", None)
def _swap_credential(self, entry) -> None:
runtime_key = getattr(entry, "runtime_api_key", None) or getattr(entry, "access_token", "")
runtime_base = getattr(entry, "runtime_base_url", None) or getattr(entry, "base_url", None) or self.base_url
if self.api_mode == "anthropic_messages":
from agent.anthropic_adapter import build_anthropic_client, _is_oauth_token
try:
self._anthropic_client.close()
except Exception:
pass
self._anthropic_api_key = runtime_key
self._anthropic_base_url = runtime_base
self._anthropic_client = build_anthropic_client(
runtime_key, runtime_base,
timeout=get_provider_request_timeout(self.provider, self.model),
)
self._is_anthropic_oauth = _is_oauth_token(runtime_key) if self.provider == "anthropic" else False
self.api_key = runtime_key
self.base_url = runtime_base
return
self.api_key = runtime_key
self.base_url = runtime_base.rstrip("/") if isinstance(runtime_base, str) else runtime_base
self._client_kwargs["api_key"] = self.api_key
self._client_kwargs["base_url"] = self.base_url
self._apply_client_headers_for_base_url(self.base_url)
self._replace_primary_openai_client(reason="credential_rotation")
def _recover_with_credential_pool(
self,
*,
status_code: Optional[int],
has_retried_429: bool,
classified_reason: Optional[FailoverReason] = None,
error_context: Optional[Dict[str, Any]] = None,
) -> tuple[bool, bool]:
"""Forwarder — see ``agent.agent_runtime_helpers.recover_with_credential_pool``."""
from agent.agent_runtime_helpers import recover_with_credential_pool
return recover_with_credential_pool(self, status_code=status_code, has_retried_429=has_retried_429, classified_reason=classified_reason, error_context=error_context)
def _credential_pool_may_recover_rate_limit(self) -> bool:
"""Whether a rate-limit retry should wait for same-provider credentials."""
pool = self._credential_pool
if pool is None:
return False
if (
self.provider == "google-gemini-cli"
or str(getattr(self, "base_url", "")).startswith("cloudcode-pa://")
):
# CloudCode/Gemini quota windows are usually account-level throttles.
# Prefer the configured fallback immediately instead of waiting out
# Retry-After while a pooled OAuth credential may still appear usable.
return False
return pool.has_available()
def _anthropic_messages_create(self, api_kwargs: dict):
if self.api_mode == "anthropic_messages":
self._try_refresh_anthropic_client_credentials()
return self._anthropic_client.messages.create(**api_kwargs)
def _rebuild_anthropic_client(self) -> None:
"""Rebuild the Anthropic client after an interrupt or stale call.
Handles both direct Anthropic and Bedrock-hosted Anthropic models
correctly — rebuilding with the Bedrock SDK when provider is bedrock,
rather than always falling back to build_anthropic_client() which
requires a direct Anthropic API key.
Honors ``self._oauth_1m_beta_disabled`` (set by the reactive recovery
path when an OAuth subscription rejects the 1M-context beta) so the
rebuilt client carries the reduced beta set.
"""
_drop_1m = bool(getattr(self, "_oauth_1m_beta_disabled", False))
if getattr(self, "provider", None) == "bedrock":
from agent.anthropic_adapter import build_anthropic_bedrock_client
region = getattr(self, "_bedrock_region", "us-east-1") or "us-east-1"
self._anthropic_client = build_anthropic_bedrock_client(region)
else:
from agent.anthropic_adapter import build_anthropic_client
self._anthropic_client = build_anthropic_client(
self._anthropic_api_key,
getattr(self, "_anthropic_base_url", None),
timeout=get_provider_request_timeout(self.provider, self.model),
drop_context_1m_beta=_drop_1m,
)
def _interruptible_api_call(self, api_kwargs: dict):
"""Forwarder — see ``agent.chat_completion_helpers.interruptible_api_call``."""
from agent.chat_completion_helpers import interruptible_api_call
return interruptible_api_call(self, api_kwargs)
# ── Unified streaming API call ─────────────────────────────────────────
def _reset_stream_delivery_tracking(self) -> None:
"""Reset tracking for text delivered during the current model response."""
# Flush any benign partial-tag tail held by the think scrubber
# first (#17924): an innocent '<' at the end of the stream that
# turned out not to be a tag prefix should reach the UI. Then
# flush the context scrubber. Order matters — the think
# scrubber's output feeds into the context scrubber's state.
think_scrubber = getattr(self, "_stream_think_scrubber", None)
if think_scrubber is not None:
think_tail = think_scrubber.flush()
if think_tail:
# Route the tail through the context scrubber too so a
# memory-context span straddling the final boundary is
# still caught.
ctx_scrubber = getattr(self, "_stream_context_scrubber", None)
if ctx_scrubber is not None:
think_tail = ctx_scrubber.feed(think_tail)
if think_tail:
callbacks = [cb for cb in (self.stream_delta_callback, self._stream_callback) if cb is not None]
for cb in callbacks:
try:
cb(think_tail)
except Exception:
pass
self._record_streamed_assistant_text(think_tail)
# Flush any benign partial-tag tail held by the context scrubber so it
# reaches the UI before we clear state for the next model call. If
# the scrubber is mid-span, flush() drops the orphaned content.
scrubber = getattr(self, "_stream_context_scrubber", None)
if scrubber is not None:
tail = scrubber.flush()
if tail:
callbacks = [cb for cb in (self.stream_delta_callback, self._stream_callback) if cb is not None]
for cb in callbacks:
try:
cb(tail)
except Exception:
pass
self._record_streamed_assistant_text(tail)
self._current_streamed_assistant_text = ""
def _record_streamed_assistant_text(self, text: str) -> None:
"""Accumulate visible assistant text emitted through stream callbacks."""
if isinstance(text, str) and text:
self._current_streamed_assistant_text = (
getattr(self, "_current_streamed_assistant_text", "") + text
)
@staticmethod
def _normalize_interim_visible_text(text: str) -> str:
if not isinstance(text, str):
return ""
return re.sub(r"\s+", " ", text).strip()
def _interim_content_was_streamed(self, content: str) -> bool:
visible_content = self._normalize_interim_visible_text(
self._strip_think_blocks(content or "")
)
if not visible_content:
return False
streamed = self._normalize_interim_visible_text(
self._strip_think_blocks(getattr(self, "_current_streamed_assistant_text", "") or "")
)
return bool(streamed) and streamed == visible_content
def _emit_interim_assistant_message(self, assistant_msg: Dict[str, Any]) -> None:
"""Surface a real mid-turn assistant commentary message to the UI layer."""
cb = getattr(self, "interim_assistant_callback", None)
if cb is None or not isinstance(assistant_msg, dict):
return
content = assistant_msg.get("content")
visible = self._strip_think_blocks(content or "").strip()
if not visible or visible == "(empty)":
return
already_streamed = self._interim_content_was_streamed(visible)
try:
cb(visible, already_streamed=already_streamed)
except Exception:
logger.debug("interim_assistant_callback error", exc_info=True)
def _fire_stream_delta(self, text: str) -> None:
"""Fire all registered stream delta callbacks (display + TTS)."""
# If a tool iteration set the break flag, prepend a single paragraph
# break before the first real text delta. This prevents the original
# problem (text concatenation across tool boundaries) without stacking
# blank lines when multiple tool iterations run back-to-back.
if getattr(self, "_stream_needs_break", False) and text and text.strip():
self._stream_needs_break = False
text = "\n\n" + text
prepended_break = True
else:
prepended_break = False
if isinstance(text, str):
# Suppress reasoning/thinking blocks via the stateful
# scrubber (#17924). Earlier versions ran _strip_think_blocks
# per-delta here, which destroyed downstream state machines
# when a tag was split across deltas (e.g. MiniMax-M2.7
# sends '<think>' and its content as separate deltas —
# regex case 2 erased the first delta, so the CLI/gateway
# state machine never saw the open tag and leaked the
# reasoning content as regular response text).
think_scrubber = getattr(self, "_stream_think_scrubber", None)
if think_scrubber is not None:
text = think_scrubber.feed(text or "")
else:
# Defensive: legacy callers without the scrubber attribute.
text = self._strip_think_blocks(text or "")
# Then feed through the stateful context scrubber so memory-context
# spans split across chunks cannot leak to the UI (#5719).
scrubber = getattr(self, "_stream_context_scrubber", None)
if scrubber is not None:
text = scrubber.feed(text)
else:
# Defensive: legacy callers without the scrubber attribute.
text = sanitize_context(text)
# Only strip leading newlines on the first delta — mid-stream "\n" is legitimate markdown.
if not prepended_break and not getattr(
self, "_current_streamed_assistant_text", ""
):
text = text.lstrip("\n")
if not text:
return
callbacks = [cb for cb in (self.stream_delta_callback, self._stream_callback) if cb is not None]
delivered = False
for cb in callbacks:
try:
cb(text)
delivered = True
except Exception:
pass
if delivered:
self._record_streamed_assistant_text(text)
def _fire_reasoning_delta(self, text: str) -> None:
"""Fire reasoning callback if registered."""
cb = self.reasoning_callback
if cb is not None:
try:
cb(text)
except Exception:
pass
def _fire_tool_gen_started(self, tool_name: str) -> None:
"""Notify display layer that the model is generating tool call arguments.
Fires once per tool name when the streaming response begins producing
tool_call / tool_use tokens. Gives the TUI a chance to show a spinner
or status line so the user isn't staring at a frozen screen while a
large tool payload (e.g. a 45 KB write_file) is being generated.
"""
cb = self.tool_gen_callback
if cb is not None:
try:
cb(tool_name)
except Exception:
pass
def _has_stream_consumers(self) -> bool:
"""Return True if any streaming consumer is registered."""
return (
self.stream_delta_callback is not None
or getattr(self, "_stream_callback", None) is not None
)
def _interruptible_streaming_api_call(
self, api_kwargs: dict, *, on_first_delta: callable = None
):
"""Forwarder — see ``agent.chat_completion_helpers.interruptible_streaming_api_call``."""
from agent.chat_completion_helpers import interruptible_streaming_api_call
return interruptible_streaming_api_call(self, api_kwargs, on_first_delta=on_first_delta)
def _try_activate_fallback(self, reason: "FailoverReason | None" = None) -> bool:
"""Forwarder — see ``agent.chat_completion_helpers.try_activate_fallback``."""
from agent.chat_completion_helpers import try_activate_fallback
return try_activate_fallback(self, reason)
# ── Per-turn primary restoration ─────────────────────────────────────
def _restore_primary_runtime(self) -> bool:
"""Forwarder — see ``agent.agent_runtime_helpers.restore_primary_runtime``."""
from agent.agent_runtime_helpers import restore_primary_runtime
return restore_primary_runtime(self)
def _try_recover_primary_transport(
self, api_error: Exception, *, retry_count: int, max_retries: int,
) -> bool:
"""Forwarder — see ``agent.agent_runtime_helpers.try_recover_primary_transport``."""
from agent.agent_runtime_helpers import try_recover_primary_transport
return try_recover_primary_transport(self, api_error, retry_count=retry_count, max_retries=max_retries)
@staticmethod
def _content_has_image_parts(content: Any) -> bool:
if not isinstance(content, list):
return False
for part in content:
if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}:
return True
return False
@staticmethod
def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
header, _, data = str(image_url or "").partition(",")
mime = "image/jpeg"
if header.startswith("data:"):
mime_part = header[len("data:"):].split(";", 1)[0].strip()
if mime_part.startswith("image/"):
mime = mime_part
suffix = {
"image/png": ".png",
"image/gif": ".gif",
"image/webp": ".webp",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
}.get(mime, ".jpg")
tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
try:
with tmp:
tmp.write(base64.b64decode(data))
except Exception:
# delete=False means a corrupt/unsupported data URL would otherwise
# leak a zero-byte temp file on every failed materialization.
try:
os.unlink(tmp.name)
except OSError:
pass
raise
path = Path(tmp.name)
return str(path), path
def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
cached = self._anthropic_image_fallback_cache.get(cache_key)
if cached:
return cached
role_label = {
"assistant": "assistant",
"tool": "tool result",
}.get(role, "user")
analysis_prompt = (
"Describe everything visible in this image in thorough detail. "
"Include any text, code, UI, data, objects, people, layout, colors, "
"and any other notable visual information."
)
vision_source = str(image_url or "")
cleanup_path: Optional[Path] = None
if vision_source.startswith("data:"):
vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
description = ""
try:
from tools.vision_tools import vision_analyze_tool
result_json = asyncio.run(
vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
)
result = json.loads(result_json) if isinstance(result_json, str) else {}
description = (result.get("analysis") or "").strip()
except Exception as e:
description = f"Image analysis failed: {e}"
finally:
if cleanup_path and cleanup_path.exists():
try:
cleanup_path.unlink()
except OSError:
pass
if not description:
description = "Image analysis failed."
note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
if vision_source and not str(image_url or "").startswith("data:"):
note += (
f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
)
self._anthropic_image_fallback_cache[cache_key] = note
return note
def _model_supports_vision(self) -> bool:
"""Return True if the active provider+model reports native vision.
Used to decide whether to strip image content parts from API-bound
messages (for non-vision models) or let the provider adapter handle
them natively (for vision-capable models).
"""
try:
from agent.models_dev import get_model_capabilities
provider = (getattr(self, "provider", "") or "").strip()
model = (getattr(self, "model", "") or "").strip()
if not provider or not model:
return False
caps = get_model_capabilities(provider, model)
if caps is None:
return False
return bool(caps.supports_vision)
except Exception:
return False
def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
if not self._content_has_image_parts(content):
return content
text_parts: List[str] = []
image_notes: List[str] = []
for part in content:
if isinstance(part, str):
if part.strip():
text_parts.append(part.strip())
continue
if not isinstance(part, dict):
continue
ptype = part.get("type")
if ptype in {"text", "input_text"}:
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
continue
if ptype in {"image_url", "input_image"}:
image_data = part.get("image_url", {})
image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
if image_url:
image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
else:
image_notes.append("[An image was attached but no image source was available.]")
continue
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
prefix = "\n\n".join(note for note in image_notes if note).strip()
suffix = "\n".join(text for text in text_parts if text).strip()
if prefix and suffix:
return f"{prefix}\n\n{suffix}"
if prefix:
return prefix
if suffix:
return suffix
return "[A multimodal message was converted to text for Anthropic compatibility.]"
def _get_transport(self, api_mode: str = None):
"""Return the cached transport for the given (or current) api_mode.
Lazy-initializes on first call per api_mode. Returns None if no
transport is registered for the mode.
"""
mode = api_mode or self.api_mode
cache = getattr(self, "_transport_cache", None)
if cache is None:
cache = {}
self._transport_cache = cache
t = cache.get(mode)
if t is None:
from agent.transports import get_transport
t = get_transport(mode)
cache[mode] = t
return t
def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
# Fast exit when no message carries image content at all.
if not any(
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
for msg in api_messages
):
return api_messages
# The Anthropic adapter (agent/anthropic_adapter.py:_convert_content_part_to_anthropic)
# already translates OpenAI-style image_url/input_image parts into
# native Anthropic ``{"type": "image", "source": ...}`` blocks. When
# the active model supports vision we let the adapter do its job and
# skip this legacy text-fallback preprocessor entirely.
if self._model_supports_vision():
return api_messages
# Non-vision Anthropic model (rare today, but keep the fallback for
# compat): replace each image part with a vision_analyze text note.
transformed = copy.deepcopy(api_messages)
for msg in transformed:
if not isinstance(msg, dict):
continue
msg["content"] = self._preprocess_anthropic_content(
msg.get("content"),
str(msg.get("role", "user") or "user"),
)
return transformed
def _prepare_messages_for_non_vision_model(self, api_messages: list) -> list:
"""Strip native image parts when the active model lacks vision.
Runs on the chat.completions / codex_responses paths. Vision-capable
models pass through unchanged (provider and any downstream translator
handle the image parts natively). Non-vision models get each image
replaced by a cached vision_analyze text description so the turn
doesn't fail with "model does not support image input".
"""
if not any(
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
for msg in api_messages
):
return api_messages
if self._model_supports_vision():
return api_messages
transformed = copy.deepcopy(api_messages)
for msg in transformed:
if not isinstance(msg, dict):
continue
# Reuse the Anthropic text-fallback preprocessor — the behaviour is
# identical (walk content parts, replace images with cached
# descriptions, merge back into a single text or structured
# content). Naming is historical.
msg["content"] = self._preprocess_anthropic_content(
msg.get("content"),
str(msg.get("role", "user") or "user"),
)
return transformed
def _tool_result_content_for_active_model(self, tool_name: str, result: Any) -> Any:
"""Return the tool message content that is safe for the active model.
Multimodal tool results normally unwrap to OpenAI-style content parts so
vision-capable models can inspect screenshots. Text-only providers must
not receive those image parts, because a rejected tool result becomes
part of the canonical history and can make the next user turn fail before
the agent has a chance to recover.
"""
if not _is_multimodal_tool_result(result):
return result
content = result.get("content") or []
if not self._content_has_image_parts(content):
return content
if self._model_supports_vision():
return content
summary = _multimodal_text_summary(result)
if tool_name == "computer_use":
return json.dumps({
"error": (
"computer_use returned screenshot/image content, but the active "
"model/provider does not support image input. Switch to a "
"vision-capable model for desktop computer use, or use browser "
"tools for browser tasks."
),
"text_summary": summary,
})
logger.warning(
"Tool %s returned image content for non-vision model %s/%s; "
"falling back to text summary",
tool_name,
self.provider,
self.model,
)
return summary
def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
"""Forwarder — see ``agent.conversation_compression.try_shrink_image_parts_in_messages``."""
from agent.conversation_compression import try_shrink_image_parts_in_messages
return try_shrink_image_parts_in_messages(api_messages)
def _anthropic_preserve_dots(self) -> bool:
"""True when using an anthropic-compatible endpoint that preserves dots in model names.
Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
MiniMax keeps dots (e.g. MiniMax-M2.7).
Xiaomi MiMo keeps dots (e.g. mimo-v2.5, mimo-v2.5-pro).
OpenCode Go/Zen keeps dots for non-Claude models (e.g. minimax-m2.5-free).
ZAI/Zhipu keeps dots (e.g. glm-4.7, glm-5.1).
AWS Bedrock uses dotted inference-profile IDs
(e.g. ``global.anthropic.claude-opus-4-7``,
``us.anthropic.claude-sonnet-4-5-20250929-v1:0``) and rejects
the hyphenated form with
``HTTP 400 The provided model identifier is invalid``.
Regression for #11976; mirrors the opencode-go fix for #5211
(commit f77be22c), which extended this same allowlist."""
if (getattr(self, "provider", "") or "").lower() in {
"alibaba", "minimax", "minimax-cn",
"opencode-go", "opencode-zen",
"zai", "bedrock",
"xiaomi",
}:
return True
base = (getattr(self, "base_url", "") or "").lower()
return (
"dashscope" in base
or "aliyuncs" in base
or "minimax" in base
or "opencode.ai/zen/" in base
or "bigmodel.cn" in base
or "xiaomimimo.com" in base
# AWS Bedrock runtime endpoints — defense-in-depth when
# ``provider`` is unset but ``base_url`` still names Bedrock.
or "bedrock-runtime." in base
)
def _is_qwen_portal(self) -> bool:
"""Return True when the base URL targets Qwen Portal."""
return base_url_host_matches(self._base_url_lower, "portal.qwen.ai")
def _qwen_prepare_chat_messages(self, api_messages: list) -> list:
prepared = copy.deepcopy(api_messages)
if not prepared:
return prepared
for msg in prepared:
if not isinstance(msg, dict):
continue
content = msg.get("content")
if isinstance(content, str):
msg["content"] = [{"type": "text", "text": content}]
elif isinstance(content, list):
# Normalize: convert bare strings to text dicts, keep dicts as-is.
# deepcopy already created independent copies, no need for dict().
normalized_parts = []
for part in content:
if isinstance(part, str):
normalized_parts.append({"type": "text", "text": part})
elif isinstance(part, dict):
normalized_parts.append(part)
if normalized_parts:
msg["content"] = normalized_parts
# Inject cache_control on the last part of the system message.
for msg in prepared:
if isinstance(msg, dict) and msg.get("role") == "system":
content = msg.get("content")
if isinstance(content, list) and content and isinstance(content[-1], dict):
content[-1]["cache_control"] = {"type": "ephemeral"}
break
return prepared
def _qwen_prepare_chat_messages_inplace(self, messages: list) -> None:
"""In-place variant — mutates an already-copied message list."""
if not messages:
return
for msg in messages:
if not isinstance(msg, dict):
continue
content = msg.get("content")
if isinstance(content, str):
msg["content"] = [{"type": "text", "text": content}]
elif isinstance(content, list):
normalized_parts = []
for part in content:
if isinstance(part, str):
normalized_parts.append({"type": "text", "text": part})
elif isinstance(part, dict):
normalized_parts.append(part)
if normalized_parts:
msg["content"] = normalized_parts
for msg in messages:
if isinstance(msg, dict) and msg.get("role") == "system":
content = msg.get("content")
if isinstance(content, list) and content and isinstance(content[-1], dict):
content[-1]["cache_control"] = {"type": "ephemeral"}
break
def _build_api_kwargs(self, api_messages: list) -> dict:
"""Forwarder — see ``agent.chat_completion_helpers.build_api_kwargs``."""
from agent.chat_completion_helpers import build_api_kwargs
return build_api_kwargs(self, api_messages)
def _supports_reasoning_extra_body(self) -> bool:
"""Return True when reasoning extra_body is safe to send for this route/model.
OpenRouter forwards unknown extra_body fields to upstream providers.
Some providers/routes reject `reasoning` with 400s, so gate it to
known reasoning-capable model families and direct Nous Portal.
"""
if base_url_host_matches(self._base_url_lower, "nousresearch.com"):
return True
if base_url_host_matches(self._base_url_lower, "ai-gateway.vercel.sh"):
return True
if (
base_url_host_matches(self._base_url_lower, "models.github.ai")
or base_url_host_matches(self._base_url_lower, "api.githubcopilot.com")
):
try:
from hermes_cli.models import github_model_reasoning_efforts
return bool(github_model_reasoning_efforts(self.model))
except Exception:
return False
if (self.provider or "").strip().lower() == "lmstudio":
opts = self._lmstudio_reasoning_options_cached()
# "off-only" (or absent) means no real reasoning capability.
return any(opt and opt != "off" for opt in opts)
if "openrouter" not in self._base_url_lower:
return False
if "api.mistral.ai" in self._base_url_lower:
return False
model = (self.model or "").lower()
reasoning_model_prefixes = (
"deepseek/",
"anthropic/",
"openai/",
"x-ai/",
"google/gemini-2",
"qwen/qwen3",
"tencent/hy3-preview",
"xiaomi/",
)
return any(model.startswith(prefix) for prefix in reasoning_model_prefixes)
def _lmstudio_reasoning_options_cached(self) -> list[str]:
"""Probe LM Studio's published reasoning ``allowed_options`` once per
(model, base_url). The list (e.g. ``["off","on"]`` or
``["off","minimal","low"]``) is needed both for the supports-reasoning
gate and for clamping the emitted ``reasoning_effort`` so toggle-style
models don't 400 on ``high``. Cache is keyed on (model, base_url) so
``/model`` swaps and base-URL changes don't reuse a stale list.
Non-empty results are cached permanently (model capabilities don't
change). Empty results (transient probe failure OR genuinely
non-reasoning model) are cached with a 60-second TTL to avoid an
HTTP round-trip on every turn while still retrying reasonably soon.
"""
import time as _time
cache = getattr(self, "_lm_reasoning_opts_cache", None)
if cache is None:
cache = self._lm_reasoning_opts_cache = {}
key = (self.model, self.base_url)
cached = cache.get(key)
if cached is not None:
opts, ts = cached
# Non-empty → permanent. Empty → 60s TTL.
if opts or (_time.monotonic() - ts) < 60:
return opts
try:
from hermes_cli.models import lmstudio_model_reasoning_options
opts = lmstudio_model_reasoning_options(
self.model, self.base_url, getattr(self, "api_key", ""),
)
except Exception:
opts = []
cache[key] = (opts, _time.monotonic())
return opts
def _resolve_lmstudio_summary_reasoning_effort(self) -> Optional[str]:
"""Resolve a safe top-level ``reasoning_effort`` for LM Studio.
The iteration-limit summary path calls ``chat.completions.create()``
directly, bypassing the transport. Share the helper so the two paths
can't drift on effort resolution and clamping.
"""
from agent.lmstudio_reasoning import resolve_lmstudio_effort
return resolve_lmstudio_effort(
self.reasoning_config,
self._lmstudio_reasoning_options_cached(),
)
def _github_models_reasoning_extra_body(self) -> dict | None:
"""Format reasoning payload for GitHub Models/OpenAI-compatible routes."""
try:
from hermes_cli.models import github_model_reasoning_efforts
except Exception:
return None
supported_efforts = github_model_reasoning_efforts(self.model)
if not supported_efforts:
return None
if self.reasoning_config and isinstance(self.reasoning_config, dict):
if self.reasoning_config.get("enabled") is False:
return None
requested_effort = str(
self.reasoning_config.get("effort", "medium")
).strip().lower()
else:
requested_effort = "medium"
if requested_effort == "xhigh" and "high" in supported_efforts:
requested_effort = "high"
elif requested_effort not in supported_efforts:
if requested_effort == "minimal" and "low" in supported_efforts:
requested_effort = "low"
elif "medium" in supported_efforts:
requested_effort = "medium"
else:
requested_effort = supported_efforts[0]
return {"effort": requested_effort}
def _build_assistant_message(self, assistant_message, finish_reason: str) -> dict:
"""Forwarder — see ``agent.chat_completion_helpers.build_assistant_message``."""
from agent.chat_completion_helpers import build_assistant_message
return build_assistant_message(self, assistant_message, finish_reason)
def _needs_thinking_reasoning_pad(self) -> bool:
"""Return True when the active provider enforces reasoning_content echo-back.
DeepSeek v4 thinking and Kimi / Moonshot thinking both reject replays
of assistant tool-call messages that omit ``reasoning_content`` (refs
#15250, #17400). Xiaomi MiMo thinking mode has the same requirement.
"""
return (
self._needs_deepseek_tool_reasoning()
or self._needs_kimi_tool_reasoning()
or self._needs_mimo_tool_reasoning()
)
def _needs_kimi_tool_reasoning(self) -> bool:
"""Return True when the current provider is Kimi / Moonshot thinking mode.
Kimi ``/coding`` and Moonshot thinking mode both require
``reasoning_content`` on every assistant tool-call message; omitting
it causes the next replay to fail with HTTP 400.
"""
return (
self.provider in {"kimi-coding", "kimi-coding-cn"}
or base_url_host_matches(self.base_url, "api.kimi.com")
or base_url_host_matches(self.base_url, "moonshot.ai")
or base_url_host_matches(self.base_url, "moonshot.cn")
)
def _needs_deepseek_tool_reasoning(self) -> bool:
"""Return True when the current provider is DeepSeek thinking mode.
DeepSeek V4 thinking mode requires ``reasoning_content`` on every
assistant tool-call turn; omitting it causes HTTP 400 when the
message is replayed in a subsequent API request (#15250).
"""
provider = (self.provider or "").lower()
model = (self.model or "").lower()
return (
provider == "deepseek"
or "deepseek" in model
or base_url_host_matches(self.base_url, "api.deepseek.com")
)
def _needs_mimo_tool_reasoning(self) -> bool:
"""Return True when the current provider is Xiaomi MiMo thinking mode.
MiMo thinking mode requires ``reasoning_content`` on every assistant
tool-call message when replaying history; omitting it causes HTTP 400.
Refs: https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/passing-back-reasoning_content
"""
provider = (self.provider or "").lower()
model = (self.model or "").lower()
return (
provider == "xiaomi"
or "mimo" in model
or base_url_host_matches(self.base_url, "api.xiaomimimo.com")
or base_url_host_matches(self.base_url, "xiaomimimo.com")
)
def _copy_reasoning_content_for_api(self, source_msg: dict, api_msg: dict) -> None:
"""Copy provider-facing reasoning fields onto an API replay message."""
if source_msg.get("role") != "assistant":
return
# 1. Explicit reasoning_content already set — preserve it verbatim
# (includes DeepSeek/Kimi's own space-placeholder written at creation
# time, and any valid reasoning content from the same provider).
#
# Exception: sessions persisted BEFORE #17341 have empty-string
# placeholders pinned at creation time. DeepSeek V4 Pro rejects
# those with HTTP 400. When the active provider enforces the
# thinking-mode echo, upgrade "" → " " on replay so stale history
# doesn't 400 the user on the next turn.
existing = source_msg.get("reasoning_content")
if isinstance(existing, str):
if existing == "" and self._needs_thinking_reasoning_pad():
api_msg["reasoning_content"] = " "
else:
api_msg["reasoning_content"] = existing
return
needs_thinking_pad = self._needs_thinking_reasoning_pad()
# 2. Cross-provider poisoned history (#15748): on DeepSeek/Kimi,
# if the source turn has tool_calls AND a 'reasoning' field but no
# 'reasoning_content' key, the 'reasoning' text was written by a
# prior provider (e.g. MiniMax) — DeepSeek's own _build_assistant_message
# pins reasoning_content at creation time for tool-call turns, so the
# shape (reasoning set, reasoning_content absent, tool_calls present)
# is unreachable from same-provider DeepSeek history after this fix.
# Inject a single space to satisfy the API without leaking another
# provider's chain of thought to DeepSeek/Kimi. Space (not "")
# because DeepSeek V4 Pro rejects empty-string reasoning_content
# in thinking mode (refs #17341).
normalized_reasoning = source_msg.get("reasoning")
if (
needs_thinking_pad
and source_msg.get("tool_calls")
and isinstance(normalized_reasoning, str)
and normalized_reasoning
):
api_msg["reasoning_content"] = " "
return
# 3. Healthy session: promote 'reasoning' field to 'reasoning_content'
# for providers that use the internal 'reasoning' key.
# This must happen before the unconditional empty-string fallback so
# genuine reasoning content is not overwritten (#15812 regression in
# PR #15478).
if isinstance(normalized_reasoning, str) and normalized_reasoning:
api_msg["reasoning_content"] = normalized_reasoning
return
# 4. DeepSeek / Kimi thinking mode: all assistant messages need
# reasoning_content. Inject a single space to satisfy the provider's
# requirement when no explicit reasoning content is present. Covers
# both tool-call turns (already-poisoned history with no reasoning
# at all) and plain text turns. Space (not "") because DeepSeek V4
# Pro tightened validation and rejects empty string with HTTP 400
# ("The reasoning content in the thinking mode must be passed back
# to the API"). Refs #17341.
if needs_thinking_pad:
api_msg["reasoning_content"] = " "
return
# 5. reasoning_content was present but not a string (e.g. None after
# context compaction). Don't pass null to the API.
api_msg.pop("reasoning_content", None)
@staticmethod
def _sanitize_tool_calls_for_strict_api(api_msg: dict) -> dict:
"""Strip Codex Responses API fields from tool_calls for strict providers.
Providers like Mistral, Fireworks, and other strict OpenAI-compatible APIs
validate the Chat Completions schema and reject unknown fields (call_id,
response_item_id) with 400 or 422 errors. These fields are preserved in
the internal message history — this method only modifies the outgoing
API copy.
Creates new tool_call dicts rather than mutating in-place, so the
original messages list retains call_id/response_item_id for Codex
Responses API compatibility (e.g. if the session falls back to a
Codex provider later).
Fields stripped: call_id, response_item_id
"""
tool_calls = api_msg.get("tool_calls")
if not isinstance(tool_calls, list):
return api_msg
_STRIP_KEYS = {"call_id", "response_item_id"}
api_msg["tool_calls"] = [
{k: v for k, v in tc.items() if k not in _STRIP_KEYS}
if isinstance(tc, dict) else tc
for tc in tool_calls
]
return api_msg
@staticmethod
def _sanitize_tool_call_arguments(
messages: list,
*,
logger=None,
session_id: str = None,
) -> int:
"""Forwarder — see ``agent.agent_runtime_helpers.sanitize_tool_call_arguments``."""
from agent.agent_runtime_helpers import sanitize_tool_call_arguments
return sanitize_tool_call_arguments(messages, logger=logger, session_id=session_id)
def _should_sanitize_tool_calls(self) -> bool:
"""Determine if tool_calls need sanitization for strict APIs.
Codex Responses API uses fields like call_id and response_item_id
that are not part of the standard Chat Completions schema. These
fields must be stripped when calling any other API to avoid
validation errors (400 Bad Request).
Returns:
bool: True if sanitization is needed (non-Codex API), False otherwise.
"""
return self.api_mode != "codex_responses"
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default", focus_topic: str = None) -> tuple:
"""Forwarder — see ``agent.conversation_compression.compress_context``."""
from agent.conversation_compression import compress_context
return compress_context(
self, messages, system_message,
approx_tokens=approx_tokens, task_id=task_id, focus_topic=focus_topic,
)
def _set_tool_guardrail_halt(self, decision: ToolGuardrailDecision) -> None:
"""Record the first guardrail decision that should stop this turn."""
if decision.should_halt and self._tool_guardrail_halt_decision is None:
self._tool_guardrail_halt_decision = decision
def _toolguard_controlled_halt_response(self, decision: ToolGuardrailDecision) -> str:
tool = decision.tool_name or "a tool"
return (
f"I stopped retrying {tool} because it hit the tool-call guardrail "
f"({decision.code}) after {decision.count} repeated non-progressing "
"attempts. The last tool result explains the blocker; the next step is "
"to change strategy instead of repeating the same call."
)
def _append_guardrail_observation(
self,
tool_name: str,
function_args: dict,
function_result: str,
*,
failed: bool,
) -> str:
decision = self._tool_guardrails.after_call(
tool_name,
function_args,
function_result,
failed=failed,
)
if decision.action in {"warn", "halt"}:
function_result = append_toolguard_guidance(function_result, decision)
if decision.should_halt:
self._set_tool_guardrail_halt(decision)
return function_result
def _guardrail_block_result(self, decision: ToolGuardrailDecision) -> str:
self._set_tool_guardrail_halt(decision)
return toolguard_synthetic_result(decision)
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Execute tool calls from the assistant message and append results to messages.
Dispatches to concurrent execution only for batches that look
independent: read-only tools may always share the parallel path, while
file reads/writes may do so only when their target paths do not overlap.
"""
tool_calls = assistant_message.tool_calls
# Allow _vprint during tool execution even with stream consumers
self._executing_tools = True
try:
if not _should_parallelize_tool_batch(tool_calls):
return self._execute_tool_calls_sequential(
assistant_message, messages, effective_task_id, api_call_count
)
return self._execute_tool_calls_concurrent(
assistant_message, messages, effective_task_id, api_call_count
)
finally:
self._executing_tools = False
def _dispatch_delegate_task(self, function_args: dict) -> str:
"""Single call site for delegate_task dispatch.
New DELEGATE_TASK_SCHEMA fields only need to be added here to reach all
invocation paths (concurrent, sequential, inline).
"""
from tools.delegate_tool import delegate_task as _delegate_task
return _delegate_task(
goal=function_args.get("goal"),
context=function_args.get("context"),
toolsets=function_args.get("toolsets"),
tasks=function_args.get("tasks"),
max_iterations=function_args.get("max_iterations"),
acp_command=function_args.get("acp_command"),
acp_args=function_args.get("acp_args"),
role=function_args.get("role"),
parent_agent=self,
)
def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str,
tool_call_id: Optional[str] = None, messages: list = None,
pre_tool_block_checked: bool = False) -> str:
"""Invoke a single tool and return the result string. No display logic.
Handles both agent-level tools (todo, memory, etc.) and registry-dispatched
tools. Used by the concurrent execution path; the sequential path retains
its own inline invocation for backward-compatible display handling.
"""
# Check plugin hooks for a block directive before executing anything.
block_message: Optional[str] = None
if not pre_tool_block_checked:
try:
from hermes_cli.plugins import get_pre_tool_call_block_message
block_message = get_pre_tool_call_block_message(
function_name, function_args, task_id=effective_task_id or "",
)
except Exception:
pass
if block_message is not None:
return json.dumps({"error": block_message}, ensure_ascii=False)
if function_name == "todo":
from tools.todo_tool import todo_tool as _todo_tool
return _todo_tool(
todos=function_args.get("todos"),
merge=function_args.get("merge", False),
store=self._todo_store,
)
elif function_name == "session_search":
session_db = self._get_session_db_for_recall()
if not session_db:
from hermes_state import format_session_db_unavailable
return json.dumps({"success": False, "error": format_session_db_unavailable()})
from tools.session_search_tool import session_search as _session_search
return _session_search(
query=function_args.get("query", ""),
role_filter=function_args.get("role_filter"),
limit=function_args.get("limit", 3),
db=session_db,
current_session_id=self.session_id,
)
elif function_name == "memory":
target = function_args.get("target", "memory")
from tools.memory_tool import memory_tool as _memory_tool
result = _memory_tool(
action=function_args.get("action"),
target=target,
content=function_args.get("content"),
old_text=function_args.get("old_text"),
store=self._memory_store,
)
# Bridge: notify external memory provider of built-in memory writes
if self._memory_manager and function_args.get("action") in {"add", "replace"}:
try:
self._memory_manager.on_memory_write(
function_args.get("action", ""),
target,
function_args.get("content", ""),
metadata=self._build_memory_write_metadata(
task_id=effective_task_id,
tool_call_id=tool_call_id,
),
)
except Exception:
pass
return result
elif self._memory_manager and self._memory_manager.has_tool(function_name):
return self._memory_manager.handle_tool_call(function_name, function_args)
elif function_name == "clarify":
from tools.clarify_tool import clarify_tool as _clarify_tool
return _clarify_tool(
question=function_args.get("question", ""),
choices=function_args.get("choices"),
callback=self.clarify_callback,
)
elif function_name == "delegate_task":
return self._dispatch_delegate_task(function_args)
else:
return handle_function_call(
function_name, function_args, effective_task_id,
tool_call_id=tool_call_id,
session_id=self.session_id or "",
enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
skip_pre_tool_call_hook=True,
)
@staticmethod
def _wrap_verbose(label: str, text: str, indent: str = " ") -> str:
"""Word-wrap verbose tool output to fit the terminal width.
Splits *text* on existing newlines and wraps each line individually,
preserving intentional line breaks (e.g. pretty-printed JSON).
Returns a ready-to-print string with *label* on the first line and
continuation lines indented.
"""
import shutil as _shutil
import textwrap as _tw
cols = _shutil.get_terminal_size((120, 24)).columns
wrap_width = max(40, cols - len(indent))
out_lines: list[str] = []
for raw_line in text.split("\n"):
if len(raw_line) <= wrap_width:
out_lines.append(raw_line)
else:
wrapped = _tw.wrap(raw_line, width=wrap_width,
break_long_words=True,
break_on_hyphens=False)
out_lines.extend(wrapped or [raw_line])
body = ("\n" + indent).join(out_lines)
return f"{indent}{label}{body}"
def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Forwarder — see ``agent.tool_executor.execute_tool_calls_concurrent``."""
from agent.tool_executor import execute_tool_calls_concurrent
return execute_tool_calls_concurrent(self, assistant_message, messages, effective_task_id, api_call_count)
def _execute_tool_calls_sequential(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Forwarder — see ``agent.tool_executor.execute_tool_calls_sequential``."""
from agent.tool_executor import execute_tool_calls_sequential
return execute_tool_calls_sequential(self, assistant_message, messages, effective_task_id, api_call_count)
def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
"""Forwarder — see ``agent.chat_completion_helpers.handle_max_iterations``."""
from agent.chat_completion_helpers import handle_max_iterations
return handle_max_iterations(self, messages, api_call_count)
def run_conversation(
self,
user_message: str,
system_message: str = None,
conversation_history: List[Dict[str, Any]] = None,
task_id: str = None,
stream_callback: Optional[callable] = None,
persist_user_message: Optional[str] = None,
) -> Dict[str, Any]:
"""
Run a complete conversation with tool calling until completion.
Args:
user_message (str): The user's message/question
system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
conversation_history (List[Dict]): Previous conversation messages (optional)
task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
stream_callback: Optional callback invoked with each text delta during streaming.
Used by the TTS pipeline to start audio generation before the full response.
When None (default), API calls use the standard non-streaming path.
persist_user_message: Optional clean user message to store in
transcripts/history when user_message contains API-only
synthetic prefixes.
or queuing follow-up prefetch work.
Returns:
Dict: Complete conversation result with final response and message history
"""
# Guard stdio against OSError from broken pipes (systemd/headless/daemon).
# Installed once, transparent when streams are healthy, prevents crash on write.
_install_safe_stdio()
self._ensure_db_session()
# Tell auxiliary_client what the live main provider/model are for
# this turn. Used by tools whose behaviour depends on the active
# main model (e.g. vision_analyze's native fast path) so they see
# the CLI/gateway override instead of the stale config.yaml
# default. Idempotent — fine to call every turn.
try:
from agent.auxiliary_client import set_runtime_main
set_runtime_main(
getattr(self, "provider", "") or "",
getattr(self, "model", "") or "",
)
except Exception:
pass
# Tag all log records on this thread with the session ID so
# ``hermes logs --session <id>`` can filter a single conversation.
from hermes_logging import set_session_context
set_session_context(self.session_id)
# Bind the skill write-origin ContextVar for this thread so tool
# handlers (e.g. skill_manage create) can tell whether they are
# running inside the background self-improvement review fork vs.
# a foreground user-directed turn. Set at the top of each call;
# the review fork runs on its own thread with a fresh context,
# so the foreground value here does not leak into it.
from tools.skill_provenance import set_current_write_origin
set_current_write_origin(getattr(self, "_memory_write_origin", "assistant_tool"))
# If the previous turn activated fallback, restore the primary
# runtime so this turn gets a fresh attempt with the preferred model.
# No-op when _fallback_activated is False (gateway, first turn, etc.).
self._restore_primary_runtime()
# Sanitize surrogate characters from user input. Clipboard paste from
# rich-text editors (Google Docs, Word, etc.) can inject lone surrogates
# that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK.
if isinstance(user_message, str):
user_message = _sanitize_surrogates(user_message)
if isinstance(persist_user_message, str):
persist_user_message = _sanitize_surrogates(persist_user_message)
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
self._persist_user_message_idx = None
self._persist_user_message_override = persist_user_message
# Generate unique task_id if not provided to isolate VMs between concurrent tasks
effective_task_id = task_id or str(uuid.uuid4())
# Expose the active task_id so tools running mid-turn (e.g. delegate_task
# in delegate_tool.py) can identify this agent for the cross-agent file
# state registry. Set BEFORE any tool dispatch so snapshots taken at
# child-launch time see the parent's real id, not None.
self._current_task_id = effective_task_id
# Reset retry counters and iteration budget at the start of each turn
# so subagent usage from a previous turn doesn't eat into the next one.
self._invalid_tool_retries = 0
self._invalid_json_retries = 0
self._empty_content_retries = 0
self._incomplete_scratchpad_retries = 0
self._codex_incomplete_retries = 0
self._thinking_prefill_retries = 0
self._post_tool_empty_retried = False
self._last_content_with_tools = None
self._last_content_tools_all_housekeeping = False
self._mute_post_response = False
self._unicode_sanitization_passes = 0
self._tool_guardrails.reset_for_turn()
self._tool_guardrail_halt_decision = None
# True until the server rejects an image_url content part with an error
# like "Only 'text' content type is supported." Set to False on first
# rejection and kept False for the rest of the session so we never re-send
# images to a text-only endpoint. Scoped per `_run()` call, not per instance.
self._vision_supported = True
# Pre-turn connection health check: detect and clean up dead TCP
# connections left over from provider outages or dropped streams.
# This prevents the next API call from hanging on a zombie socket.
if self.api_mode != "anthropic_messages":
try:
if self._cleanup_dead_connections():
self._emit_status(
"🔌 Detected stale connections from a previous provider "
"issue — cleaned up automatically. Proceeding with fresh "
"connection."
)
except Exception:
pass
# Replay compression warning through status_callback for gateway
# platforms (the callback was not wired during __init__).
if self._compression_warning:
self._replay_compression_warning()
self._compression_warning = None # send once
# NOTE: _turns_since_memory and _iters_since_skill are NOT reset here.
# They are initialized in __init__ and must persist across run_conversation
# calls so that nudge logic accumulates correctly in CLI mode.
self.iteration_budget = IterationBudget(self.max_iterations)
# Log conversation turn start for debugging/observability
_preview_text = _summarize_user_message_for_log(user_message)
_msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text
_msg_preview = _msg_preview.replace("\n", " ")
logger.info(
"conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r",
self.session_id or "none", self.model, self.provider or "unknown",
self.platform or "unknown", len(conversation_history or []),
_msg_preview,
)
# Initialize conversation (copy to avoid mutating the caller's list)
messages = list(conversation_history) if conversation_history else []
# Hydrate todo store from conversation history (gateway creates a fresh
# AIAgent per message, so the in-memory store is empty -- we need to
# recover the todo state from the most recent todo tool response in history)
if conversation_history and not self._todo_store.has_items():
self._hydrate_todo_store(conversation_history)
# Hydrate per-session nudge counters from persisted history.
# Gateway creates a fresh AIAgent per inbound message (cache miss /
# 1h idle eviction / config-signature mismatch / process restart), so
# _turns_since_memory and _user_turn_count start at 0 every turn and
# the memory.nudge_interval trigger may never be reached. Reconstruct
# an effective count from prior user turns in conversation_history.
# Idempotent: a cached agent that already accumulated counters keeps
# them; only a freshly-built agent with empty in-memory state hydrates.
# See issue #22357.
if conversation_history and self._user_turn_count == 0:
prior_user_turns = sum(
1 for m in conversation_history if m.get("role") == "user"
)
if prior_user_turns > 0:
self._user_turn_count = prior_user_turns
if self._memory_nudge_interval > 0 and self._turns_since_memory == 0:
# % preserves original 1-in-N cadence rather than firing a
# review immediately on resume (which would surprise users
# whose session happened to land just past a multiple of N).
self._turns_since_memory = prior_user_turns % self._memory_nudge_interval
# Prefill messages (few-shot priming) are injected at API-call time only,
# never stored in the messages list. This keeps them ephemeral: they won't
# be saved to session DB, session logs, or batch trajectories, but they're
# automatically re-applied on every API call (including session continuations).
# Track user turns for memory flush and periodic nudge logic
self._user_turn_count += 1
# Reset the streaming context scrubber at the top of each turn so a
# hung span from a prior interrupted stream can't taint this turn's
# output.
scrubber = getattr(self, "_stream_context_scrubber", None)
if scrubber is not None:
scrubber.reset()
# Reset the think scrubber for the same reason — an interrupted
# prior stream may have left us inside an unterminated block.
think_scrubber = getattr(self, "_stream_think_scrubber", None)
if think_scrubber is not None:
think_scrubber.reset()
# Preserve the original user message (no nudge injection).
original_user_message = persist_user_message if persist_user_message is not None else user_message
# Track memory nudge trigger (turn-based, checked here).
# Skill trigger is checked AFTER the agent loop completes, based on
# how many tool iterations THIS turn used.
_should_review_memory = False
if (self._memory_nudge_interval > 0
and "memory" in self.valid_tool_names
and self._memory_store):
self._turns_since_memory += 1
if self._turns_since_memory >= self._memory_nudge_interval:
_should_review_memory = True
self._turns_since_memory = 0
# Add user message
user_msg = {"role": "user", "content": user_message}
messages.append(user_msg)
current_turn_user_idx = len(messages) - 1
self._persist_user_message_idx = current_turn_user_idx
if not self.quiet_mode:
_print_preview = _summarize_user_message_for_log(user_message)
self._safe_print(f"💬 Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'")
# ── System prompt (cached per session for prefix caching) ──
# Built once on first call, reused for all subsequent calls.
# Only rebuilt after context compression events (which invalidate
# the cache and reload memory from disk).
#
# For continuing sessions (gateway creates a fresh AIAgent per
# message), we load the stored system prompt from the session DB
# instead of rebuilding. Rebuilding would pick up memory changes
# from disk that the model already knows about (it wrote them!),
# producing a different system prompt and breaking the Anthropic
# prefix cache.
if self._cached_system_prompt is None:
stored_prompt = None
if conversation_history and self._session_db:
try:
session_row = self._session_db.get_session(self.session_id)
if session_row:
stored_prompt = session_row.get("system_prompt") or None
except Exception:
pass # Fall through to build fresh
if stored_prompt:
# Continuing session — reuse the exact system prompt from
# the previous turn so the Anthropic cache prefix matches.
self._cached_system_prompt = stored_prompt
else:
# First turn of a new session — build from scratch.
self._cached_system_prompt = self._build_system_prompt(system_message)
# Plugin hook: on_session_start
# Fired once when a brand-new session is created (not on
# continuation). Plugins can use this to initialise
# session-scoped state (e.g. warm a memory cache).
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_invoke_hook(
"on_session_start",
session_id=self.session_id,
model=self.model,
platform=getattr(self, "platform", None) or "",
)
except Exception as exc:
logger.warning("on_session_start hook failed: %s", exc)
# Store the system prompt snapshot in SQLite
if self._session_db:
try:
self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt)
except Exception as e:
logger.debug("Session DB update_system_prompt failed: %s", e)
active_system_prompt = self._cached_system_prompt
# ── Preflight context compression ──
# Before entering the main loop, check if the loaded conversation
# history already exceeds the model's context threshold. This handles
# cases where a user switches to a model with a smaller context window
# while having a large existing session — compress proactively rather
# than waiting for an API error (which might be caught as a non-retryable
# 4xx and abort the request entirely).
if (
self.compression_enabled
and len(messages) > self.context_compressor.protect_first_n
+ self.context_compressor.protect_last_n + 1
):
# Include tool schema tokens — with many tools these can add
# 20-30K+ tokens that the old sys+msg estimate missed entirely.
_preflight_tokens = estimate_request_tokens_rough(
messages,
system_prompt=active_system_prompt or "",
tools=self.tools or None,
)
if _preflight_tokens >= self.context_compressor.threshold_tokens:
logger.info(
"Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
f"{_preflight_tokens:,}",
f"{self.context_compressor.threshold_tokens:,}",
self.model,
f"{self.context_compressor.context_length:,}",
)
self._emit_status(
f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
f">= {self.context_compressor.threshold_tokens:,} threshold. "
"This may take a moment."
)
# May need multiple passes for very large sessions with small
# context windows (each pass summarises the middle N turns).
for _pass in range(3):
_orig_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=_preflight_tokens,
task_id=effective_task_id,
)
if len(messages) >= _orig_len:
break # Cannot compress further
# Compression created a new session — clear the history
# reference so _flush_messages_to_session_db writes ALL
# compressed messages to the new session's SQLite, not
# skipping them because conversation_history is still the
# pre-compression length.
conversation_history = None
# Fix: reset retry counters after compression so the model
# gets a fresh budget on the compressed context. Without
# this, pre-compression retries carry over and the model
# hits "(empty)" immediately after compression-induced
# context loss.
self._empty_content_retries = 0
self._thinking_prefill_retries = 0
self._last_content_with_tools = None
self._last_content_tools_all_housekeeping = False
self._mute_post_response = False
# Re-estimate after compression
_preflight_tokens = estimate_request_tokens_rough(
messages,
system_prompt=active_system_prompt or "",
tools=self.tools or None,
)
if _preflight_tokens < self.context_compressor.threshold_tokens:
break # Under threshold
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
# return a dict with a ``context`` key (or a plain string) whose
# value is appended to the current turn's user message.
#
# Context is ALWAYS injected into the user message, never the
# system prompt. This preserves the prompt cache prefix — the
# system prompt stays identical across turns so cached tokens
# are reused. The system prompt is Hermes's territory; plugins
# contribute context alongside the user's input.
#
# All injected context is ephemeral (not persisted to session DB).
_plugin_user_context = ""
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_pre_results = _invoke_hook(
"pre_llm_call",
session_id=self.session_id,
user_message=original_user_message,
conversation_history=list(messages),
is_first_turn=(not bool(conversation_history)),
model=self.model,
platform=getattr(self, "platform", None) or "",
sender_id=getattr(self, "_user_id", None) or "",
)
_ctx_parts: list[str] = []
for r in _pre_results:
if isinstance(r, dict) and r.get("context"):
_ctx_parts.append(str(r["context"]))
elif isinstance(r, str) and r.strip():
_ctx_parts.append(r)
if _ctx_parts:
_plugin_user_context = "\n\n".join(_ctx_parts)
except Exception as exc:
logger.warning("pre_llm_call hook failed: %s", exc)
# Main conversation loop
api_call_count = 0
final_response = None
interrupted = False
codex_ack_continuations = 0
length_continue_retries = 0
truncated_tool_call_retries = 0
truncated_response_prefix = ""
compression_attempts = 0
_turn_exit_reason = "unknown" # Diagnostic: why the loop ended
# Per-turn file-mutation verifier state. Keyed by resolved path;
# each failed ``write_file`` / ``patch`` call records the error
# preview. Later successful writes to the same path remove the
# entry (the model recovered). At end-of-turn, any entries still
# present are surfaced in an advisory footer so the model cannot
# over-claim success while the file is actually unchanged on disk.
self._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {}
# Record the execution thread so interrupt()/clear_interrupt() can
# scope the tool-level interrupt signal to THIS agent's thread only.
# Must be set before any thread-scoped interrupt syncing.
self._execution_thread_id = threading.current_thread().ident
# Always clear stale per-thread state from a previous turn. If an
# interrupt arrived before startup finished, preserve it and bind it
# to this execution thread now instead of dropping it on the floor.
_set_interrupt(False, self._execution_thread_id)
if self._interrupt_requested:
_set_interrupt(True, self._execution_thread_id)
self._interrupt_thread_signal_pending = False
else:
self._interrupt_message = None
self._interrupt_thread_signal_pending = False
# Notify memory providers of the new turn so cadence tracking works.
# Must happen BEFORE prefetch_all() so providers know which turn it is
# and can gate context/dialectic refresh via contextCadence/dialecticCadence.
if self._memory_manager:
try:
_turn_msg = original_user_message if isinstance(original_user_message, str) else ""
self._memory_manager.on_turn_start(self._user_turn_count, _turn_msg)
except Exception:
pass
# External memory provider: prefetch once before the tool loop.
# Reuse the cached result on every iteration to avoid re-calling
# prefetch_all() on each tool call (10 tool calls = 10x latency + cost).
# Use original_user_message (clean input) — user_message may contain
# injected skill content that bloats / breaks provider queries.
_ext_prefetch_cache = ""
if self._memory_manager:
try:
_query = original_user_message if isinstance(original_user_message, str) else ""
_ext_prefetch_cache = self._memory_manager.prefetch_all(_query) or ""
except Exception:
pass
# Optional opt-in runtime: if api_mode == codex_app_server, hand the
# turn to the codex app-server subprocess (terminal/file ops/patching
# all run inside Codex). Default Hermes path is bypassed entirely.
# See agent/transports/codex_app_server_session.py for the adapter
# and references/codex-app-server-runtime.md for the rationale.
if self.api_mode == "codex_app_server":
return self._run_codex_app_server_turn(
user_message=user_message,
original_user_message=original_user_message,
messages=messages,
effective_task_id=effective_task_id,
should_review_memory=_should_review_memory,
)
while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call:
# Reset per-turn checkpoint dedup so each iteration can take one snapshot
self._checkpoint_mgr.new_turn()
# Check for interrupt request (e.g., user sent new message)
if self._interrupt_requested:
interrupted = True
_turn_exit_reason = "interrupted_by_user"
if not self.quiet_mode:
self._safe_print("\n⚡ Breaking out of tool loop due to interrupt...")
break
api_call_count += 1
self._api_call_count = api_call_count
self._touch_activity(f"starting API call #{api_call_count}")
# Grace call: the budget is exhausted but we gave the model one
# more chance. Consume the grace flag so the loop exits after
# this iteration regardless of outcome.
if self._budget_grace_call:
self._budget_grace_call = False
elif not self.iteration_budget.consume():
_turn_exit_reason = "budget_exhausted"
if not self.quiet_mode:
self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)")
break
# Fire step_callback for gateway hooks (agent:step event)
if self.step_callback is not None:
try:
prev_tools = []
for _idx, _m in enumerate(reversed(messages)):
if _m.get("role") == "assistant" and _m.get("tool_calls"):
_fwd_start = len(messages) - _idx
_results_by_id = {}
for _tm in messages[_fwd_start:]:
if _tm.get("role") != "tool":
break
_tcid = _tm.get("tool_call_id")
if _tcid:
_results_by_id[_tcid] = _tm.get("content", "")
prev_tools = [
{
"name": tc["function"]["name"],
"result": _results_by_id.get(tc.get("id")),
"arguments": tc["function"].get("arguments"),
}
for tc in _m["tool_calls"]
if isinstance(tc, dict)
]
break
self.step_callback(api_call_count, prev_tools)
except Exception as _step_err:
logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)
# Track tool-calling iterations for skill nudge.
# Counter resets whenever skill_manage is actually used.
if (self._skill_nudge_interval > 0
and "skill_manage" in self.valid_tool_names):
self._iters_since_skill += 1
# ── Pre-API-call /steer drain ──────────────────────────────────
# If a /steer arrived during the previous API call (while the model
# was thinking), drain it now — before we build api_messages — so
# the model sees the steer text on THIS iteration. Without this,
# steers sent during an API call only land after the NEXT tool batch,
# which may never come if the model returns a final response.
#
# We scan backwards for the last tool-role message in the messages
# list. If found, the steer is appended there. If not (first
# iteration, no tools yet), the steer stays pending for the next
# tool batch — injecting into a user message would break role
# alternation, and there's no tool output to piggyback on.
_pre_api_steer = self._drain_pending_steer()
if _pre_api_steer:
_injected = False
for _si in range(len(messages) - 1, -1, -1):
_sm = messages[_si]
if isinstance(_sm, dict) and _sm.get("role") == "tool":
marker = f"\n\nUser guidance: {_pre_api_steer}"
existing = _sm.get("content", "")
if isinstance(existing, str):
_sm["content"] = existing + marker
else:
# Multimodal content blocks — append text block
try:
blocks = list(existing) if existing else []
blocks.append({"type": "text", "text": marker})
_sm["content"] = blocks
except Exception:
pass
_injected = True
logger.debug(
"Pre-API-call steer drain: injected into tool msg at index %d",
_si,
)
break
if not _injected:
# No tool message to inject into — put it back so
# the post-tool-execution drain picks it up later.
_lock = getattr(self, "_pending_steer_lock", None)
if _lock is not None:
with _lock:
if self._pending_steer:
self._pending_steer = self._pending_steer + "\n" + _pre_api_steer
else:
self._pending_steer = _pre_api_steer
else:
existing = getattr(self, "_pending_steer", None)
self._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer
# Prepare messages for API call
# If we have an ephemeral system prompt, prepend it to the messages
# Note: Reasoning is embedded in content via <think> tags for trajectory storage.
# However, providers like Moonshot AI require a separate 'reasoning_content' field
# on assistant messages with tool_calls. We handle both cases here.
request_logger = getattr(self, "logger", None) or logging.getLogger(__name__)
repaired_tool_calls = self._sanitize_tool_call_arguments(
messages,
logger=request_logger,
session_id=self.session_id,
)
if repaired_tool_calls > 0:
request_logger.info(
"Sanitized %s corrupted tool_call arguments before request (session=%s)",
repaired_tool_calls,
self.session_id or "-",
)
# Defensive: repair malformed role-alternation before API call.
# Catches cases where the history got wedged into a
# ``tool → user`` or ``user → user`` tail (e.g. after empty-
# response scaffolding was stripped and a new user message
# landed after an orphan tool result). Most providers return
# empty content on malformed sequences, which would otherwise
# retrigger the empty-retry loop indefinitely.
repaired_seq = self._repair_message_sequence(messages)
if repaired_seq > 0:
request_logger.info(
"Repaired %s message-alternation violations before request (session=%s)",
repaired_seq,
self.session_id or "-",
)
api_messages = []
for idx, msg in enumerate(messages):
api_msg = msg.copy()
# Inject ephemeral context into the current turn's user message.
# Sources: memory manager prefetch + plugin pre_llm_call hooks
# with target="user_message" (the default). Both are
# API-call-time only — the original message in `messages` is
# never mutated, so nothing leaks into session persistence.
if idx == current_turn_user_idx and msg.get("role") == "user":
_injections = []
if _ext_prefetch_cache:
_fenced = build_memory_context_block(_ext_prefetch_cache)
if _fenced:
_injections.append(_fenced)
if _plugin_user_context:
_injections.append(_plugin_user_context)
if _injections:
_base = api_msg.get("content", "")
if isinstance(_base, str):
api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections)
# For ALL assistant messages, pass reasoning back to the API
# This ensures multi-turn reasoning context is preserved
self._copy_reasoning_content_for_api(msg, api_msg)
# Remove 'reasoning' field - it's for trajectory storage only
# We've copied it to 'reasoning_content' for the API above
if "reasoning" in api_msg:
api_msg.pop("reasoning")
# Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
if "finish_reason" in api_msg:
api_msg.pop("finish_reason")
# Strip internal thinking-prefill marker
api_msg.pop("_thinking_prefill", None)
# Strip Codex Responses API fields (call_id, response_item_id) for
# strict providers like Mistral, Fireworks, etc. that reject unknown fields.
# Uses new dicts so the internal messages list retains the fields
# for Codex Responses compatibility.
if self._should_sanitize_tool_calls():
self._sanitize_tool_calls_for_strict_api(api_msg)
# Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context
# The signature field helps maintain reasoning continuity
api_messages.append(api_msg)
# Build the final system message: cached prompt + ephemeral system prompt.
# Ephemeral additions are API-call-time only (not persisted to session DB).
# External recall context is injected into the user message, not the system
# prompt, so the stable cache prefix remains unchanged.
#
# NOTE: Plugin context from pre_llm_call hooks is injected into the
# user message (see injection block above), NOT the system prompt.
# This is intentional — system prompt modifications break the prompt
# cache prefix. The system prompt is reserved for Hermes internals.
#
# Hermes invariant: the system prompt is built ONCE per session
# (cached on ``_cached_system_prompt``) and replayed verbatim on
# every turn. We send it as a single content string so the
# bytes are byte-stable across turns and upstream prompt caches
# stay warm.
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages
# Inject ephemeral prefill messages right after the system prompt
# but before conversation history. Same API-call-time-only pattern.
if self.prefill_messages:
sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0
for idx, pfm in enumerate(self.prefill_messages):
api_messages.insert(sys_offset + idx, pfm.copy())
# Apply Anthropic prompt caching for Claude models on native
# Anthropic, OpenRouter, and third-party Anthropic-compatible
# gateways. Auto-detected: if ``_use_prompt_caching`` is set,
# inject cache_control breakpoints (system + last 3 messages)
# to reduce input token costs by ~75% on multi-turn
# conversations.
if self._use_prompt_caching:
api_messages = apply_anthropic_cache_control(
api_messages,
cache_ttl=self._cache_ttl,
native_anthropic=self._use_native_cache_layout,
)
# Safety net: strip orphaned tool results / add stubs for missing
# results before sending to the API. Runs unconditionally — not
# gated on context_compressor — so orphans from session loading or
# manual message manipulation are always caught.
api_messages = self._sanitize_api_messages(api_messages)
# Drop thinking-only assistant turns (reasoning but no visible
# output and no tool_calls) and merge any adjacent user messages
# left behind. Prevents Anthropic 400s ("The final block in an
# assistant message cannot be `thinking`.") and equivalent errors
# from third-party Anthropic-compatible gateways that can't replay
# a thinking-only turn. Runs on the per-call copy only — the
# stored conversation history keeps the reasoning block for the
# UI transcript and session persistence.
api_messages = self._drop_thinking_only_and_merge_users(api_messages)
# Normalize message whitespace and tool-call JSON for consistent
# prefix matching. Ensures bit-perfect prefixes across turns,
# which enables KV cache reuse on local inference servers
# (llama.cpp, vLLM, Ollama) and improves cache hit rates for
# cloud providers. Operates on api_messages (the API copy) so
# the original conversation history in `messages` is untouched.
for am in api_messages:
if isinstance(am.get("content"), str):
am["content"] = am["content"].strip()
for am in api_messages:
tcs = am.get("tool_calls")
if not tcs:
continue
new_tcs = []
for tc in tcs:
if isinstance(tc, dict) and "function" in tc:
try:
args_obj = json.loads(tc["function"]["arguments"])
tc = {**tc, "function": {
**tc["function"],
"arguments": json.dumps(
args_obj, separators=(",", ":"),
sort_keys=True,
),
}}
except Exception:
tc["function"]["arguments"] = _repair_tool_call_arguments(
tc["function"]["arguments"],
tc["function"].get("name", "?"),
)
new_tcs.append(tc)
am["tool_calls"] = new_tcs
# Proactively strip any surrogate characters before the API call.
# Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return
# lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside
# the OpenAI SDK. Sanitizing here prevents the 3-retry cycle.
_sanitize_messages_surrogates(api_messages)
# Calculate approximate request size for logging
total_chars = sum(len(str(msg)) for msg in api_messages)
approx_tokens = estimate_messages_tokens_rough(api_messages)
# Thinking spinner for quiet mode (animated during API call)
thinking_spinner = None
if not self.quiet_mode:
self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...")
self._vprint(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
self._vprint(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}")
else:
# Animated thinking spinner in quiet mode
face = random.choice(KawaiiSpinner.get_thinking_faces())
verb = random.choice(KawaiiSpinner.get_thinking_verbs())
if self.thinking_callback:
# CLI TUI mode: use prompt_toolkit widget instead of raw spinner
# (works in both streaming and non-streaming modes)
self.thinking_callback(f"{face} {verb}...")
elif not self._has_stream_consumers() and self._should_start_quiet_spinner():
# Raw KawaiiSpinner only when no streaming consumers and the
# spinner output has a safe sink.
spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star'])
thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=self._print_fn)
thinking_spinner.start()
# Log request details if verbose
if self.verbose_logging:
logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}")
logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}")
logging.debug(f"Total message size: ~{approx_tokens:,} tokens")
api_start_time = time.time()
retry_count = 0
max_retries = self._api_max_retries
primary_recovery_attempted = False
max_compression_attempts = 3
codex_auth_retry_attempted=False
anthropic_auth_retry_attempted=False
nous_auth_retry_attempted=False
copilot_auth_retry_attempted=False
thinking_sig_retry_attempted = False
image_shrink_retry_attempted = False
oauth_1m_beta_retry_attempted = False
llama_cpp_grammar_retry_attempted = False
has_retried_429 = False
restart_with_compressed_messages = False
restart_with_length_continuation = False
finish_reason = "stop"
response = None # Guard against UnboundLocalError if all retries fail
api_kwargs = None # Guard against UnboundLocalError in except handler
while retry_count < max_retries:
# ── Nous Portal rate limit guard ──────────────────────
# If another session already recorded that Nous is rate-
# limited, skip the API call entirely. Each attempt
# (including SDK-level retries) counts against RPH and
# deepens the rate limit hole.
if self.provider == "nous":
try:
from agent.nous_rate_guard import (
nous_rate_limit_remaining,
format_remaining as _fmt_nous_remaining,
)
_nous_remaining = nous_rate_limit_remaining()
if _nous_remaining is not None and _nous_remaining > 0:
_nous_msg = (
f"Nous Portal rate limit active — "
f"resets in {_fmt_nous_remaining(_nous_remaining)}."
)
self._vprint(
f"{self.log_prefix}{_nous_msg} Trying fallback...",
force=True,
)
self._emit_status(f"{_nous_msg}")
if self._try_activate_fallback():
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
# No fallback available — return with clear message
self._persist_session(messages, conversation_history)
return {
"final_response": (
f"{_nous_msg}\n\n"
"No fallback provider available. "
"Try again after the reset, or add a "
"fallback provider in config.yaml."
),
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": _nous_msg,
}
except ImportError:
pass
except Exception:
pass # Never let rate guard break the agent loop
try:
self._reset_stream_delivery_tracking()
api_kwargs = self._build_api_kwargs(api_messages)
if self._force_ascii_payload:
_sanitize_structure_non_ascii(api_kwargs)
if self.api_mode == "codex_responses":
api_kwargs = self._get_transport().preflight_kwargs(api_kwargs, allow_stream=False)
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_invoke_hook(
"pre_api_request",
task_id=effective_task_id,
session_id=self.session_id or "",
platform=self.platform or "",
model=self.model,
provider=self.provider,
base_url=self.base_url,
api_mode=self.api_mode,
api_call_count=api_call_count,
message_count=len(api_messages),
tool_count=len(self.tools or []),
approx_input_tokens=approx_tokens,
request_char_count=total_chars,
max_tokens=self.max_tokens,
)
except Exception:
pass
if env_var_enabled("HERMES_DUMP_REQUESTS"):
self._dump_api_request_debug(api_kwargs, reason="preflight")
# Always prefer the streaming path — even without stream
# consumers. Streaming gives us fine-grained health
# checking (90s stale-stream detection, 60s read timeout)
# that the non-streaming path lacks. Without this,
# subagents and other quiet-mode callers can hang
# indefinitely when the provider keeps the connection
# alive with SSE pings but never delivers a response.
# The streaming path is a no-op for callbacks when no
# consumers are registered, and falls back to non-
# streaming automatically if the provider doesn't
# support it.
def _stop_spinner():
nonlocal thinking_spinner
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
_use_streaming = True
# Provider signaled "stream not supported" on a previous
# attempt — switch to non-streaming for the rest of this
# session instead of re-failing every retry.
if getattr(self, "_disable_streaming", False):
_use_streaming = False
# CopilotACPClient communicates via subprocess stdio and
# returns a plain SimpleNamespace — not an iterable
# stream. Mirror the ACP exclusion used for Responses
# API upgrade (lines ~1083-1085).
elif (
self.provider == "copilot-acp"
or str(self.base_url or "").lower().startswith("acp://copilot")
or str(self.base_url or "").lower().startswith("acp+tcp://")
):
_use_streaming = False
elif not self._has_stream_consumers():
# No display/TTS consumer. Still prefer streaming for
# health checking, but skip for Mock clients in tests
# (mocks return SimpleNamespace, not stream iterators).
from unittest.mock import Mock
if isinstance(getattr(self, "client", None), Mock):
_use_streaming = False
if _use_streaming:
response = self._interruptible_streaming_api_call(
api_kwargs, on_first_delta=_stop_spinner
)
else:
response = self._interruptible_api_call(api_kwargs)
api_duration = time.time() - api_start_time
# Stop thinking spinner silently -- the response box or tool
# execution messages that follow are more informative.
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
if not self.quiet_mode:
self._vprint(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s")
if self.verbose_logging:
# Log response with provider info if available
resp_model = getattr(response, 'model', 'N/A') if response else 'N/A'
logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
# Validate response shape before proceeding
response_invalid = False
error_details = []
if self.api_mode == "codex_responses":
_ct_v = self._get_transport()
if not _ct_v.validate_response(response):
if response is None:
response_invalid = True
error_details.append("response is None")
else:
# Provider returned a terminal failure (e.g. quota exhaustion).
# Treat as invalid so the fallback chain is triggered instead of
# letting the error bubble up outside the retry/fallback loop.
_codex_resp_status = str(getattr(response, "status", "") or "").strip().lower()
if _codex_resp_status in {"failed", "cancelled"}:
_codex_error_obj = getattr(response, "error", None)
_codex_error_msg = (
_codex_error_obj.get("message") if isinstance(_codex_error_obj, dict)
else str(_codex_error_obj) if _codex_error_obj
else f"Responses API returned status '{_codex_resp_status}'"
)
logging.warning(
"Codex response status='%s' (error=%s). Routing to fallback. %s",
_codex_resp_status, _codex_error_msg,
self._client_log_context(),
)
response_invalid = True
error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}")
else:
# output_text fallback: stream backfill may have failed
# but normalize can still recover from output_text
_out_text = getattr(response, "output_text", None)
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
if _out_text_stripped:
logger.debug(
"Codex response.output is empty but output_text is present "
"(%d chars); deferring to normalization.",
len(_out_text_stripped),
)
else:
_resp_status = getattr(response, "status", None)
_resp_incomplete = getattr(response, "incomplete_details", None)
logger.warning(
"Codex response.output is empty after stream backfill "
"(status=%s, incomplete_details=%s, model=%s). %s",
_resp_status, _resp_incomplete,
getattr(response, "model", None),
f"api_mode={self.api_mode} provider={self.provider}",
)
response_invalid = True
error_details.append("response.output is empty")
elif self.api_mode == "anthropic_messages":
_tv = self._get_transport()
if not _tv.validate_response(response):
response_invalid = True
if response is None:
error_details.append("response is None")
else:
error_details.append("response.content invalid (not a non-empty list)")
elif self.api_mode == "bedrock_converse":
_btv = self._get_transport()
if not _btv.validate_response(response):
response_invalid = True
if response is None:
error_details.append("response is None")
else:
error_details.append("Bedrock response invalid (no output or choices)")
else:
_ctv = self._get_transport()
if not _ctv.validate_response(response):
response_invalid = True
if response is None:
error_details.append("response is None")
elif not hasattr(response, 'choices'):
error_details.append("response has no 'choices' attribute")
elif response.choices is None:
error_details.append("response.choices is None")
else:
error_details.append("response.choices is empty")
if response_invalid:
# Stop spinner before printing error messages
if thinking_spinner:
thinking_spinner.stop("(´;ω;`) oops, retrying...")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
# Invalid response — could be rate limiting, provider timeout,
# upstream server error, or malformed response.
retry_count += 1
# Eager fallback: empty/malformed responses are a common
# rate-limit symptom. Switch to fallback immediately
# rather than retrying with extended backoff.
if self._fallback_index < len(self._fallback_chain):
self._emit_status("⚠️ Empty/malformed response — switching to fallback...")
if self._try_activate_fallback():
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
# Check for error field in response (some providers include this)
error_msg = "Unknown"
provider_name = "Unknown"
if response and hasattr(response, 'error') and response.error:
error_msg = str(response.error)
# Try to extract provider from error metadata
if hasattr(response.error, 'metadata') and response.error.metadata:
provider_name = response.error.metadata.get('provider_name', 'Unknown')
elif response and hasattr(response, 'message') and response.message:
error_msg = str(response.message)
# Try to get provider from model field (OpenRouter often returns actual model used)
if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model:
provider_name = f"model={response.model}"
# Check for x-openrouter-provider or similar metadata
if provider_name == "Unknown" and response:
# Log all response attributes for debugging
resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')}
if self.verbose_logging:
logging.debug(f"Response attributes for invalid response: {resp_attrs}")
# Extract error code from response for contextual diagnostics
_resp_error_code = None
if response and hasattr(response, 'error') and response.error:
_code_raw = getattr(response.error, 'code', None)
if _code_raw is None and isinstance(response.error, dict):
_code_raw = response.error.get('code')
if _code_raw is not None:
try:
_resp_error_code = int(_code_raw)
except (TypeError, ValueError):
pass
# Build a human-readable failure hint from the error code
# and response time, instead of always assuming rate limiting.
if _resp_error_code == 524:
_failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)"
elif _resp_error_code == 504:
_failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)"
elif _resp_error_code == 429:
_failure_hint = f"rate limited by upstream provider (429)"
elif _resp_error_code in {500, 502}:
_failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)"
elif _resp_error_code in {503, 529}:
_failure_hint = f"upstream provider overloaded ({_resp_error_code})"
elif _resp_error_code is not None:
_failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)"
elif api_duration < 10:
_failure_hint = f"fast response ({api_duration:.1f}s) — likely rate limited"
elif api_duration > 60:
_failure_hint = f"slow response ({api_duration:.0f}s) — likely upstream timeout"
else:
_failure_hint = f"response time {api_duration:.1f}s"
self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True)
self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}", force=True)
cleaned_provider_error = self._clean_error_message(error_msg)
self._vprint(f"{self.log_prefix} 📝 Provider message: {cleaned_provider_error}", force=True)
self._vprint(f"{self.log_prefix} ⏱️ {_failure_hint}", force=True)
if retry_count >= max_retries:
# Try fallback before giving up
self._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
if self._try_activate_fallback():
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
self._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Invalid API response after {max_retries} retries: {_failure_hint}",
"failed": True # Mark as failure for filtering
}
# Backoff before retry — jittered exponential: 5s base, 120s cap
wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0)
self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True)
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
# Sleep in small increments to stay responsive to interrupts
sleep_end = time.time() + wait_time
_backoff_touch_counter = 0
while time.time() < sleep_end:
if self._interrupt_requested:
self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).",
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"interrupted": True,
}
time.sleep(0.2)
# Touch activity every ~30s so the gateway's inactivity
# monitor knows we're alive during backoff waits.
_backoff_touch_counter += 1
if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s
self._touch_activity(
f"retry backoff ({retry_count}/{max_retries}), "
f"{int(sleep_end - time.time())}s remaining"
)
continue # Retry the API call
# Check finish_reason before proceeding
if self.api_mode == "codex_responses":
status = getattr(response, "status", None)
incomplete_details = getattr(response, "incomplete_details", None)
incomplete_reason = None
if isinstance(incomplete_details, dict):
incomplete_reason = incomplete_details.get("reason")
else:
incomplete_reason = getattr(incomplete_details, "reason", None)
if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}:
finish_reason = "length"
else:
finish_reason = "stop"
elif self.api_mode == "anthropic_messages":
_tfr = self._get_transport()
finish_reason = _tfr.map_finish_reason(response.stop_reason)
elif self.api_mode == "bedrock_converse":
# Bedrock response already normalized at dispatch — use transport
_bt_fr = self._get_transport()
_bedrock_result = _bt_fr.normalize_response(response)
finish_reason = _bedrock_result.finish_reason
else:
_cc_fr = self._get_transport()
_finish_result = _cc_fr.normalize_response(response)
finish_reason = _finish_result.finish_reason
assistant_message = _finish_result
if self._should_treat_stop_as_truncated(
finish_reason,
assistant_message,
messages,
):
self._vprint(
f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated",
force=True,
)
finish_reason = "length"
if finish_reason == "length":
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
# Normalize the truncated response to a single OpenAI-style
# message shape so text-continuation and tool-call retry
# work uniformly across chat_completions, bedrock_converse,
# and anthropic_messages. For Anthropic we use the same
# adapter the agent loop already relies on so the rebuilt
# interim assistant message is byte-identical to what
# would have been appended in the non-truncated path.
_trunc_msg = None
_trunc_transport = self._get_transport()
if self.api_mode == "anthropic_messages":
_trunc_result = _trunc_transport.normalize_response(
response, strip_tool_prefix=self._is_anthropic_oauth
)
else:
_trunc_result = _trunc_transport.normalize_response(response)
_trunc_msg = _trunc_result
_trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
_trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False
# ── Detect thinking-budget exhaustion ──────────────
# When the model spends ALL output tokens on reasoning
# and has none left for the response, continuation
# retries are pointless. Detect this early and give a
# targeted error instead of wasting 3 API calls.
# A response is "thinking exhausted" only when the model
# actually produced reasoning blocks but no visible text after
# them. Models that do not use <think> tags (e.g. GLM-4.7 on
# NVIDIA Build, minimax) may return content=None or an empty
# string for unrelated reasons — treat those as normal
# truncations that deserve continuation retries, not as
# thinking-budget exhaustion.
_has_think_tags = bool(
_trunc_content and re.search(
r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>',
_trunc_content,
re.IGNORECASE,
)
)
_thinking_exhausted = (
not _trunc_has_tool_calls
and _has_think_tags
and (
(_trunc_content is not None and not self._has_content_after_think_block(_trunc_content))
or _trunc_content is None
)
)
if _thinking_exhausted:
_exhaust_error = (
"Model used all output tokens on reasoning with none left "
"for the response. Try lowering reasoning effort or "
"increasing max_tokens."
)
self._vprint(
f"{self.log_prefix}💭 Reasoning exhausted the output token budget — "
f"no visible response was produced.",
force=True,
)
# Return a user-friendly message as the response so
# CLI (response box) and gateway (chat message) both
# display it naturally instead of a suppressed error.
_exhaust_response = (
"⚠️ **Thinking Budget Exhausted**\n\n"
"The model used all its output tokens on reasoning "
"and had none left for the actual response.\n\n"
"To fix this:\n"
"→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n"
"→ Or switch to a larger/non-reasoning model with `/model`"
)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": _exhaust_response,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": _exhaust_error,
}
if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
assistant_message = _trunc_msg
if assistant_message is not None and not _trunc_has_tool_calls:
length_continue_retries += 1
interim_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(interim_msg)
if assistant_message.content:
truncated_response_prefix += assistant_message.content
if length_continue_retries < 3:
self._vprint(
f"{self.log_prefix}↻ Requesting continuation "
f"({length_continue_retries}/3)..."
)
continue_msg = {
"role": "user",
"content": (
"[System: Your previous response was truncated by the output "
"length limit. Continue exactly where you left off. Do not "
"restart or repeat prior text. Finish the answer directly.]"
),
}
messages.append(continue_msg)
self._session_messages = messages
self._save_session_log(messages)
restart_with_length_continuation = True
break
partial_response = self._strip_think_blocks(truncated_response_prefix).strip()
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": partial_response or None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Response remained truncated after 3 continuation attempts",
}
if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
assistant_message = _trunc_msg
if assistant_message is not None and _trunc_has_tool_calls:
if truncated_tool_call_retries < 1:
truncated_tool_call_retries += 1
self._vprint(
f"{self.log_prefix}⚠️ Truncated tool call detected — retrying API call...",
force=True,
)
# Don't append the broken response to messages;
# just re-run the same API call from the current
# message state, giving the model another chance.
continue
self._vprint(
f"{self.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
force=True,
)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Response truncated due to output length limit",
}
# If we have prior messages, roll back to last complete state
if len(messages) > 1:
self._vprint(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn")
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": rolled_back_messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Response truncated due to output length limit"
}
else:
# First message was truncated - mark as failed
self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True)
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": "First response truncated due to output length limit"
}
# Track actual token usage from response for context management
if hasattr(response, 'usage') and response.usage:
canonical_usage = normalize_usage(
response.usage,
provider=self.provider,
api_mode=self.api_mode,
)
prompt_tokens = canonical_usage.prompt_tokens
completion_tokens = canonical_usage.output_tokens
total_tokens = canonical_usage.total_tokens
usage_dict = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": total_tokens,
}
self.context_compressor.update_from_response(usage_dict)
# Cache discovered context length after successful call.
# Only persist limits confirmed by the provider (parsed
# from the error message), not guessed probe tiers.
if getattr(self.context_compressor, "_context_probed", False):
ctx = self.context_compressor.context_length
if getattr(self.context_compressor, "_context_probe_persistable", False):
save_context_length(self.model, self.base_url, ctx)
self._safe_print(f"{self.log_prefix}💾 Cached context length: {ctx:,} tokens for {self.model}")
self.context_compressor._context_probed = False
self.context_compressor._context_probe_persistable = False
self.session_prompt_tokens += prompt_tokens
self.session_completion_tokens += completion_tokens
self.session_total_tokens += total_tokens
self.session_api_calls += 1
self.session_input_tokens += canonical_usage.input_tokens
self.session_output_tokens += canonical_usage.output_tokens
self.session_cache_read_tokens += canonical_usage.cache_read_tokens
self.session_cache_write_tokens += canonical_usage.cache_write_tokens
self.session_reasoning_tokens += canonical_usage.reasoning_tokens
# Log API call details for debugging/observability
_cache_pct = ""
if canonical_usage.cache_read_tokens and prompt_tokens:
_cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)"
logger.info(
"API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s",
self.session_api_calls, self.model, self.provider or "unknown",
prompt_tokens, completion_tokens, total_tokens,
api_duration, _cache_pct,
)
cost_result = estimate_usage_cost(
self.model,
canonical_usage,
provider=self.provider,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
)
if cost_result.amount_usd is not None:
self.session_estimated_cost_usd += float(cost_result.amount_usd)
self.session_cost_status = cost_result.status
self.session_cost_source = cost_result.source
# Persist token counts to session DB for /insights.
# Do this for every platform with a session_id so non-CLI
# sessions (gateway, cron, delegated runs) cannot lose
# token/accounting data if a higher-level persistence path
# is skipped or fails. Gateway/session-store writes use
# absolute totals, so they safely overwrite these per-call
# deltas instead of double-counting them.
if self._session_db and self.session_id:
try:
# Ensure the session row exists before attempting UPDATE.
# Under concurrent load (cron/kanban), the initial
# _ensure_db_session() may have failed due to SQLite
# locking. Retry here so per-call token deltas are
# not silently lost (UPDATE on a non-existent row
# affects 0 rows without error).
if not self._session_db_created:
self._ensure_db_session()
self._session_db.update_token_counts(
self.session_id,
input_tokens=canonical_usage.input_tokens,
output_tokens=canonical_usage.output_tokens,
cache_read_tokens=canonical_usage.cache_read_tokens,
cache_write_tokens=canonical_usage.cache_write_tokens,
reasoning_tokens=canonical_usage.reasoning_tokens,
estimated_cost_usd=float(cost_result.amount_usd)
if cost_result.amount_usd is not None else None,
cost_status=cost_result.status,
cost_source=cost_result.source,
billing_provider=self.provider,
billing_base_url=self.base_url,
billing_mode="subscription_included"
if cost_result.status == "included" else None,
model=self.model,
api_call_count=1,
)
except Exception as e:
# Log token persistence failures so they're
# visible in agent.log — silent loss here is
# the root cause of undercounted analytics.
logger.debug(
"Token persistence failed (session=%s, tokens=%d): %s",
self.session_id, total_tokens, e,
)
if self.verbose_logging:
logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
# Surface cache hit stats for any provider that reports
# them — not just those where we inject cache_control
# markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic
# server-side prefix caching and return
# ``prompt_tokens_details.cached_tokens``; users
# previously could not see their cache % because this
# line was gated on ``_use_prompt_caching``, which is
# only True for Anthropic-style marker injection.
# ``canonical_usage`` is already normalised from all
# three API shapes (Anthropic / Codex / OpenAI-chat)
# so we can rely on its values directly.
cached = canonical_usage.cache_read_tokens
written = canonical_usage.cache_write_tokens
prompt = usage_dict["prompt_tokens"]
if (cached or written) and not self.quiet_mode:
hit_pct = (cached / prompt * 100) if prompt > 0 else 0
self._vprint(
f"{self.log_prefix} 💾 Cache: "
f"{cached:,}/{prompt:,} tokens "
f"({hit_pct:.0f}% hit, {written:,} written)"
)
has_retried_429 = False # Reset on success
# Clear Nous rate limit state on successful request —
# proves the limit has reset and other sessions can
# resume hitting Nous.
if self.provider == "nous":
try:
from agent.nous_rate_guard import clear_nous_rate_limit
clear_nous_rate_limit()
except Exception:
pass
self._touch_activity(f"API call #{api_call_count} completed")
break # Success, exit retry loop
except InterruptedError:
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
api_elapsed = time.time() - api_start_time
self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True)
self._persist_session(messages, conversation_history)
interrupted = True
final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)."
break
except Exception as api_error:
# Stop spinner before printing error messages
if thinking_spinner:
thinking_spinner.stop("(╥_╥) error, retrying...")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
# -----------------------------------------------------------
# UnicodeEncodeError recovery. Two common causes:
# 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste
# (Google Docs, rich-text editors) — sanitize and retry.
# 2. ASCII codec on systems with LANG=C or non-UTF-8 locale
# (e.g. Chromebooks) — any non-ASCII character fails.
# Detect via the error message mentioning 'ascii' codec.
# We sanitize messages in-place and may retry twice:
# first to strip surrogates, then once more for pure
# ASCII-only locale sanitization if needed.
# -----------------------------------------------------------
if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2:
_err_str = str(api_error).lower()
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
# Detect surrogate errors — utf-8 codec refusing to
# encode U+D800..U+DFFF. The error text is:
# "'utf-8' codec can't encode characters in position
# N-M: surrogates not allowed"
_is_surrogate_error = (
"surrogate" in _err_str
or ("'utf-8'" in _err_str and not _is_ascii_codec)
)
# Sanitize surrogates from both the canonical `messages`
# list AND `api_messages` (the API-copy, which may carry
# `reasoning_content`/`reasoning_details` transformed
# from `reasoning` — fields the canonical list doesn't
# have directly). Also clean `api_kwargs` if built and
# `prefill_messages` if present. Mirrors the ASCII
# codec recovery below.
_surrogates_found = _sanitize_messages_surrogates(messages)
if isinstance(api_messages, list):
if _sanitize_messages_surrogates(api_messages):
_surrogates_found = True
if isinstance(api_kwargs, dict):
if _sanitize_structure_surrogates(api_kwargs):
_surrogates_found = True
if isinstance(getattr(self, "prefill_messages", None), list):
if _sanitize_messages_surrogates(self.prefill_messages):
_surrogates_found = True
# Gate the retry on the error type, not on whether we
# found anything — _force_ascii_payload / the extended
# surrogate walker above cover all known paths, but a
# new transformed field could still slip through. If
# the error was a surrogate encode failure, always let
# the retry run; the proactive sanitizer at line ~8781
# runs again on the next iteration. Bounded by
# _unicode_sanitization_passes < 2 (outer guard).
if _surrogates_found or _is_surrogate_error:
self._unicode_sanitization_passes += 1
if _surrogates_found:
self._vprint(
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
force=True,
)
else:
self._vprint(
f"{self.log_prefix}⚠️ Surrogate encoding error — retrying after full-payload sanitization...",
force=True,
)
continue
if _is_ascii_codec:
self._force_ascii_payload = True
# ASCII codec: the system encoding can't handle
# non-ASCII characters at all. Sanitize all
# non-ASCII content from messages/tool schemas and retry.
# Sanitize both the canonical `messages` list and
# `api_messages` (the API-copy built before the retry
# loop, which may contain extra fields like
# reasoning_content that are not in `messages`).
_messages_sanitized = _sanitize_messages_non_ascii(messages)
if isinstance(api_messages, list):
_sanitize_messages_non_ascii(api_messages)
# Also sanitize the last api_kwargs if already built,
# so a leftover non-ASCII value in a transformed field
# (e.g. extra_body, reasoning_content) doesn't survive
# into the next attempt via _build_api_kwargs cache paths.
if isinstance(api_kwargs, dict):
_sanitize_structure_non_ascii(api_kwargs)
_prefill_sanitized = False
if isinstance(getattr(self, "prefill_messages", None), list):
_prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
_tools_sanitized = False
if isinstance(getattr(self, "tools", None), list):
_tools_sanitized = _sanitize_tools_non_ascii(self.tools)
_system_sanitized = False
if isinstance(active_system_prompt, str):
_sanitized_system = _strip_non_ascii(active_system_prompt)
if _sanitized_system != active_system_prompt:
active_system_prompt = _sanitized_system
self._cached_system_prompt = _sanitized_system
_system_sanitized = True
if isinstance(getattr(self, "ephemeral_system_prompt", None), str):
_sanitized_ephemeral = _strip_non_ascii(self.ephemeral_system_prompt)
if _sanitized_ephemeral != self.ephemeral_system_prompt:
self.ephemeral_system_prompt = _sanitized_ephemeral
_system_sanitized = True
_headers_sanitized = False
_default_headers = (
self._client_kwargs.get("default_headers")
if isinstance(getattr(self, "_client_kwargs", None), dict)
else None
)
if isinstance(_default_headers, dict):
_headers_sanitized = _sanitize_structure_non_ascii(_default_headers)
# Sanitize the API key — non-ASCII characters in
# credentials (e.g. ʋ instead of v from a bad
# copy-paste) cause httpx to fail when encoding
# the Authorization header as ASCII. This is the
# most common cause of persistent UnicodeEncodeError
# that survives message/tool sanitization (#6843).
_credential_sanitized = False
_raw_key = getattr(self, "api_key", None) or ""
if _raw_key:
_clean_key = _strip_non_ascii(_raw_key)
if _clean_key != _raw_key:
self.api_key = _clean_key
if isinstance(getattr(self, "_client_kwargs", None), dict):
self._client_kwargs["api_key"] = _clean_key
# Also update the live client — it holds its
# own copy of api_key which auth_headers reads
# dynamically on every request.
if getattr(self, "client", None) is not None and hasattr(self.client, "api_key"):
self.client.api_key = _clean_key
_credential_sanitized = True
self._vprint(
f"{self.log_prefix}⚠️ API key contained non-ASCII characters "
f"(bad copy-paste?) — stripped them. If auth fails, "
f"re-copy the key from your provider's dashboard.",
force=True,
)
# Always retry on ASCII codec detection —
# _force_ascii_payload guarantees the full
# api_kwargs payload is sanitized on the
# next iteration (line ~8475). Even when
# per-component checks above find nothing
# (e.g. non-ASCII only in api_messages'
# reasoning_content), the flag catches it.
# Bounded by _unicode_sanitization_passes < 2.
self._unicode_sanitization_passes += 1
_any_sanitized = (
_messages_sanitized
or _prefill_sanitized
or _tools_sanitized
or _system_sanitized
or _headers_sanitized
or _credential_sanitized
)
if _any_sanitized:
self._vprint(
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from request payload. Retrying...",
force=True,
)
else:
self._vprint(
f"{self.log_prefix}⚠️ System encoding is ASCII — enabling full-payload sanitization for retry...",
force=True,
)
continue
# ── Image-rejection recovery ──────────────────────────────
# Some providers (mlx-lm, text-only endpoints, text-only
# fallbacks on multimodal models) reject any message that
# contains image_url content with a 4xx error like
# "Only 'text' content type is supported." On first hit,
# strip all images from the message list, mark the session
# as vision-unsupported, and retry with text only.
#
# Detection is best-effort English phrase matching — a
# locale-translated or heavily-reworded upstream error
# will bypass this guard and fall through to the normal
# error handler. Expand the phrase list when new
# provider wordings are observed in the wild.
_err_body = ""
try:
_err_body = str(getattr(api_error, "body", None) or
getattr(api_error, "message", None) or
str(api_error))
except Exception:
pass
_err_status = getattr(api_error, "status_code", None)
_IMAGE_REJECTION_PHRASES = (
"only 'text' content type is supported",
"only text content type is supported",
"image_url is not supported",
"image content is not supported",
"multimodal is not supported",
"multimodal content is not supported",
"multimodal input is not supported",
"vision is not supported",
"vision input is not supported",
"does not support images",
"does not support image input",
"does not support multimodal",
"does not support vision",
"model does not support image",
# ChatGPT-account Codex backend
# (https://chatgpt.com/backend-api/codex) rejects
# data:image/...base64 URLs in input_image fields
# with HTTP 400 "Invalid 'input[N].content[K].image_url'.
# Expected a valid URL, but got a value with an
# invalid format." The OpenAI Responses API on the
# public endpoint accepts data URLs, but the
# ChatGPT-account variant does not. Without this
# phrase the agent cascaded into compression /
# context-too-large recovery instead of just
# stripping the images. Match is narrow on
# purpose — keyed on the field-path apostrophe so
# we don't false-trip on other URL validation
# errors. (issue #23570)
"image_url'. expected",
# DeepSeek's OpenAI-compatible API reports text-only
# request-body variants as:
# "unknown variant `image_url`, expected `text`".
"unknown variant `image_url`, expected `text`",
"unknown variant image_url, expected text",
)
_err_lower = _err_body.lower()
_looks_like_image_rejection = any(
p in _err_lower for p in _IMAGE_REJECTION_PHRASES
)
# 4xx-only gate: never interpret 5xx/timeout as "server
# said no to images" — those are transient and must
# route to the normal retry path.
_status_ok = _err_status is None or (400 <= int(_err_status) < 500)
if (
getattr(self, "_vision_supported", True)
and _looks_like_image_rejection
and _status_ok
):
self._vision_supported = False
_imgs_removed = _strip_images_from_messages(messages)
if isinstance(api_messages, list):
_strip_images_from_messages(api_messages)
self._vprint(
f"{self.log_prefix}⚠️ Server rejected image content — "
f"switching to text-only mode for this session"
+ (". Stripped images from history and retrying." if _imgs_removed else "."),
force=True,
)
continue
status_code = getattr(api_error, "status_code", None)
error_context = self._extract_api_error_context(api_error)
# ── Classify the error for structured recovery decisions ──
_compressor = getattr(self, "context_compressor", None)
_ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000
classified = classify_api_error(
api_error,
provider=getattr(self, "provider", "") or "",
model=getattr(self, "model", "") or "",
approx_tokens=approx_tokens,
context_length=_ctx_len,
num_messages=len(api_messages) if api_messages else 0,
)
logger.debug(
"Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s",
classified.reason.value, classified.status_code,
classified.retryable, classified.should_compress,
classified.should_rotate_credential, classified.should_fallback,
)
recovered_with_pool, has_retried_429 = self._recover_with_credential_pool(
status_code=status_code,
has_retried_429=has_retried_429,
classified_reason=classified.reason,
error_context=error_context,
)
if recovered_with_pool:
continue
# Image-too-large recovery: shrink oversized native image
# parts in-place and retry once. Triggered by Anthropic's
# per-image 5 MB ceiling (400 with "image exceeds 5 MB
# maximum") or any other provider that complains about
# image size. If shrink fails or a second attempt still
# fails, fall through to normal error handling.
if (
classified.reason == FailoverReason.image_too_large
and not image_shrink_retry_attempted
):
image_shrink_retry_attempted = True
if self._try_shrink_image_parts_in_messages(api_messages):
self._vprint(
f"{self.log_prefix}📐 Image(s) exceeded provider size limit — "
f"shrank and retrying...",
force=True,
)
continue
else:
logger.info(
"image-shrink recovery: no data-URL image parts found "
"or shrink didn't reduce size; surfacing original error."
)
# Anthropic OAuth subscription rejected the 1M-context beta
# header ("long context beta is not yet available for this
# subscription"). Disable the beta for the rest of this
# session, rebuild the client, and retry once. 1M-capable
# subscriptions never hit this branch — they accept the
# beta and keep full 1M context. See PR #17680 for the
# original report (we chose reactive recovery over the
# proposed unconditional omit so capable subscriptions
# don't silently lose the capability).
if (
classified.reason == FailoverReason.oauth_long_context_beta_forbidden
and self.api_mode == "anthropic_messages"
and self._is_anthropic_oauth
and not oauth_1m_beta_retry_attempted
):
oauth_1m_beta_retry_attempted = True
if not getattr(self, "_oauth_1m_beta_disabled", False):
self._oauth_1m_beta_disabled = True
try:
self._anthropic_client.close()
except Exception:
pass
self._rebuild_anthropic_client()
self._vprint(
f"{self.log_prefix}🔕 OAuth subscription doesn't support "
f"the 1M-context beta — disabled for this session and retrying...",
force=True,
)
continue
if (
self.api_mode == "codex_responses"
and self.provider == "openai-codex"
and status_code == 401
and not codex_auth_retry_attempted
):
codex_auth_retry_attempted = True
if self._try_refresh_codex_client_credentials(force=True):
self._vprint(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...")
continue
if (
self.api_mode == "chat_completions"
and self.provider == "nous"
and status_code == 401
and not nous_auth_retry_attempted
):
nous_auth_retry_attempted = True
if self._try_refresh_nous_client_credentials(force=True):
print(f"{self.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...")
continue
# Credential refresh didn't help — show diagnostic info.
# Most common causes: Portal OAuth expired/revoked,
# account out of credits, or agent key blocked.
from hermes_constants import display_hermes_home as _dhh_fn
_dhh = _dhh_fn()
_body_text = ""
try:
_body = getattr(api_error, "body", None) or getattr(api_error, "response", None)
if _body is not None:
_body_text = str(_body)[:200]
except Exception:
pass
print(f"{self.log_prefix}🔐 Nous 401 — Portal authentication failed.")
if _body_text:
print(f"{self.log_prefix} Response: {_body_text}")
print(f"{self.log_prefix} Most likely: Portal OAuth expired, account out of credits, or agent key revoked.")
print(f"{self.log_prefix} Troubleshooting:")
print(f"{self.log_prefix} • Re-authenticate: hermes login --provider nous")
print(f"{self.log_prefix} • Check credits / billing: https://portal.nousresearch.com")
print(f"{self.log_prefix} • Verify stored credentials: {_dhh}/auth.json")
print(f"{self.log_prefix} • Switch providers temporarily: /model <model> --provider openrouter")
if (
self.provider == "copilot"
and status_code == 401
and not copilot_auth_retry_attempted
):
copilot_auth_retry_attempted = True
if self._try_refresh_copilot_client_credentials():
self._vprint(f"{self.log_prefix}🔐 Copilot credentials refreshed after 401. Retrying request...")
continue
if (
self.api_mode == "anthropic_messages"
and status_code == 401
and hasattr(self, '_anthropic_api_key')
and not anthropic_auth_retry_attempted
):
anthropic_auth_retry_attempted = True
from agent.anthropic_adapter import _is_oauth_token
if self._try_refresh_anthropic_client_credentials():
print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...")
continue
# Credential refresh didn't help — show diagnostic info
key = self._anthropic_api_key
auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)"
print(f"{self.log_prefix}🔐 Anthropic 401 — authentication failed.")
print(f"{self.log_prefix} Auth method: {auth_method}")
print(f"{self.log_prefix} Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{self.log_prefix} Token: (empty or short)")
print(f"{self.log_prefix} Troubleshooting:")
from hermes_constants import display_hermes_home as _dhh_fn
_dhh = _dhh_fn()
print(f"{self.log_prefix} • Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens")
print(f"{self.log_prefix} • Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values")
print(f"{self.log_prefix} • For API keys: verify at https://platform.claude.com/settings/keys")
print(f"{self.log_prefix} • For Claude Code: run 'claude /login' to refresh, then retry")
print(f"{self.log_prefix} • Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"")
print(f"{self.log_prefix} • Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"")
# ── Thinking block signature recovery ─────────────────
# Anthropic signs thinking blocks against the full turn
# content. Any upstream mutation (context compression,
# session truncation, message merging) invalidates the
# signature → HTTP 400. Recovery: strip reasoning_details
# from all messages so the next retry sends no thinking
# blocks at all. One-shot — don't retry infinitely.
if (
classified.reason == FailoverReason.thinking_signature
and not thinking_sig_retry_attempted
):
thinking_sig_retry_attempted = True
for _m in messages:
if isinstance(_m, dict):
_m.pop("reasoning_details", None)
self._vprint(
f"{self.log_prefix}⚠️ Thinking block signature invalid — "
f"stripped all thinking blocks, retrying...",
force=True,
)
logging.warning(
"%sThinking block signature recovery: stripped "
"reasoning_details from %d messages",
self.log_prefix, len(messages),
)
continue
# ── llama.cpp grammar-parse recovery ──────────────────
# llama.cpp's ``json-schema-to-grammar`` converter rejects
# regex escape classes (``\d``, ``\w``, ``\s``) and most
# ``format`` values in tool schemas. MCP servers emit
# these routinely for date/phone/email params. Recovery:
# strip ``pattern``/``format`` from ``self.tools`` and
# retry once. We keep the keywords by default so cloud
# providers get the full prompting hints; this branch
# fires only for users on llama.cpp's OAI server.
if (
classified.reason == FailoverReason.llama_cpp_grammar_pattern
and not llama_cpp_grammar_retry_attempted
):
llama_cpp_grammar_retry_attempted = True
try:
from tools.schema_sanitizer import strip_pattern_and_format
_, _stripped = strip_pattern_and_format(self.tools)
except Exception as _strip_exc: # pragma: no cover — defensive
logging.warning(
"%sllama.cpp grammar recovery: strip helper failed: %s",
self.log_prefix, _strip_exc,
)
_stripped = 0
if _stripped:
self._vprint(
f"{self.log_prefix}⚠️ llama.cpp rejected tool schema grammar — "
f"stripped {_stripped} pattern/format keyword(s), retrying...",
force=True,
)
logging.warning(
"%sllama.cpp grammar recovery: stripped %d "
"pattern/format keyword(s) from tool schemas",
self.log_prefix, _stripped,
)
continue
# No keywords found to strip — fall through to normal
# retry path rather than loop forever on the same error.
logging.warning(
"%sllama.cpp grammar error but no pattern/format "
"keywords to strip — falling through to normal retry",
self.log_prefix,
)
retry_count += 1
elapsed_time = time.time() - api_start_time
self._touch_activity(
f"API error recovery (attempt {retry_count}/{max_retries})"
)
error_type = type(api_error).__name__
error_msg = str(api_error).lower()
_error_summary = self._summarize_api_error(api_error)
logger.warning(
"API call failed (attempt %s/%s) error_type=%s %s summary=%s",
retry_count,
max_retries,
error_type,
self._client_log_context(),
_error_summary,
)
_provider = getattr(self, "provider", "unknown")
_base = getattr(self, "base_url", "unknown")
_model = getattr(self, "model", "unknown")
_status_code_str = f" [HTTP {status_code}]" if status_code else ""
self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True)
self._vprint(f"{self.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True)
self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True)
self._vprint(f"{self.log_prefix} 📝 Error: {_error_summary}", force=True)
if status_code and status_code < 500:
_err_body = getattr(api_error, "body", None)
_err_body_str = str(_err_body)[:300] if _err_body else None
if _err_body_str:
self._vprint(f"{self.log_prefix} 📋 Details: {_err_body_str}", force=True)
self._vprint(f"{self.log_prefix} ⏱️ Elapsed: {elapsed_time:.2f}s Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens")
# Actionable hint for OpenRouter "no tool endpoints" error.
# This fires regardless of whether fallback succeeds — the
# user needs to know WHY their model failed so they can fix
# their provider routing, not just silently fall back.
if (
self._is_openrouter_url()
and "support tool use" in error_msg
):
self._vprint(
f"{self.log_prefix} 💡 No OpenRouter providers for {_model} support tool calling with your current settings.",
force=True,
)
if self.providers_allowed:
self._vprint(
f"{self.log_prefix} Your provider_routing.only restriction is filtering out tool-capable providers.",
force=True,
)
self._vprint(
f"{self.log_prefix} Try removing the restriction or adding providers that support tools for this model.",
force=True,
)
self._vprint(
f"{self.log_prefix} Check which providers support tools: https://openrouter.ai/models/{_model}",
force=True,
)
# Check for interrupt before deciding to retry
if self._interrupt_requested:
self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": f"Operation interrupted: handling API error ({error_type}: {self._clean_error_message(str(api_error))}).",
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"interrupted": True,
}
# Check for 413 payload-too-large BEFORE generic 4xx handler.
# A 413 is a payload-size error — the correct response is to
# compress history and retry, not abort immediately.
status_code = getattr(api_error, "status_code", None)
# ── Anthropic Sonnet long-context tier gate ───────────
# Anthropic returns HTTP 429 "Extra usage is required for
# long context requests" when a Claude Max (or similar)
# subscription doesn't include the 1M-context tier. This
# is NOT a transient rate limit — retrying or switching
# credentials won't help. Reduce context to 200k (the
# standard tier) and compress.
if classified.reason == FailoverReason.long_context_tier:
_reduced_ctx = 200000
compressor = self.context_compressor
old_ctx = compressor.context_length
if old_ctx > _reduced_ctx:
compressor.update_model(
model=self.model,
context_length=_reduced_ctx,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
provider=self.provider,
)
# Context probing flags — only set on built-in
# compressor (plugin engines manage their own).
if hasattr(compressor, "_context_probed"):
compressor._context_probed = True
# Don't persist — this is a subscription-tier
# limitation, not a model capability. If the
# user later enables extra usage the 1M limit
# should come back automatically.
compressor._context_probe_persistable = False
self._vprint(
f"{self.log_prefix}⚠️ Anthropic long-context tier "
f"requires extra usage — reducing context: "
f"{old_ctx:,}{_reduced_ctx:,} tokens",
force=True,
)
compression_attempts += 1
if compression_attempts <= max_compression_attempts:
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message,
approx_tokens=approx_tokens,
task_id=effective_task_id,
)
# Compression created a new session — clear history
# so _flush_messages_to_session_db writes compressed
# messages to the new session, not skipping them.
conversation_history = None
if len(messages) < original_len or old_ctx > _reduced_ctx:
self._emit_status(
f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
f"(was {old_ctx:,}), retrying..."
)
time.sleep(2)
restart_with_compressed_messages = True
break
# Fall through to normal error handling if compression
# is exhausted or didn't help.
# Eager fallback for rate-limit errors (429 or quota exhaustion).
# When a fallback model is configured, switch immediately instead
# of burning through retries with exponential backoff -- the
# primary provider won't recover within the retry window.
is_rate_limited = classified.reason in {
FailoverReason.rate_limit,
FailoverReason.billing,
}
if is_rate_limited and self._fallback_index < len(self._fallback_chain):
# Don't eagerly fallback if credential pool rotation may
# still recover. See _pool_may_recover_from_rate_limit
# for the single-credential-pool and CloudCode-quota
# exceptions. Fixes #11314 and #13636.
pool_may_recover = _pool_may_recover_from_rate_limit(
self._credential_pool,
provider=self.provider,
base_url=getattr(self, "base_url", None),
)
if not pool_may_recover:
self._emit_status("⚠️ Rate limited — switching to fallback provider...")
if self._try_activate_fallback(reason=classified.reason):
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
# ── Nous Portal: record rate limit & skip retries ─────
# When Nous returns a 429 that is a genuine account-
# level rate limit, record the reset time to a shared
# file so ALL sessions (cron, gateway, auxiliary) know
# not to pile on, then skip further retries -- each
# one burns another RPH request and deepens the hole.
# The retry loop's top-of-iteration guard will catch
# this on the next pass and try fallback or bail.
#
# IMPORTANT: Nous Portal multiplexes multiple upstream
# providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can
# also mean an UPSTREAM provider is out of capacity
# for one specific model -- transient, clears in
# seconds, nothing to do with the caller's quota.
# Tripping the cross-session breaker on that would
# block every Nous model for minutes. We use
# ``is_genuine_nous_rate_limit`` to tell the two
# apart via the 429's own x-ratelimit-* headers and
# the last-known-good state captured on the previous
# successful response.
if (
is_rate_limited
and self.provider == "nous"
and classified.reason == FailoverReason.rate_limit
and not recovered_with_pool
):
_genuine_nous_rate_limit = False
try:
from agent.nous_rate_guard import (
is_genuine_nous_rate_limit,
record_nous_rate_limit,
)
_err_resp = getattr(api_error, "response", None)
_err_hdrs = (
getattr(_err_resp, "headers", None)
if _err_resp else None
)
_genuine_nous_rate_limit = is_genuine_nous_rate_limit(
headers=_err_hdrs,
last_known_state=self._rate_limit_state,
)
if _genuine_nous_rate_limit:
record_nous_rate_limit(
headers=_err_hdrs,
error_context=error_context,
)
else:
logging.info(
"Nous 429 looks like upstream capacity "
"(no exhausted bucket in headers or "
"last-known state) -- not tripping "
"cross-session breaker."
)
except Exception:
pass
if _genuine_nous_rate_limit:
# Skip straight to max_retries -- the
# top-of-loop guard will handle fallback or
# bail cleanly.
retry_count = max_retries
continue
# Upstream capacity 429: fall through to normal
# retry logic. A different model (or the same
# model a moment later) will typically succeed.
is_payload_too_large = (
classified.reason == FailoverReason.payload_too_large
)
if is_payload_too_large:
compression_attempts += 1
if compression_attempts > max_compression_attempts:
self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
"partial": True,
"failed": True,
"compression_exhausted": True,
}
self._emit_status(f"⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens,
task_id=effective_task_id,
)
# Compression created a new session — clear history
# so _flush_messages_to_session_db writes compressed
# messages to the new session, not skipping them.
conversation_history = None
if len(messages) < original_len:
self._emit_status(f"🗜️ Compressed {original_len}{len(messages)} messages, retrying...")
time.sleep(2) # Brief pause between compression retries
restart_with_compressed_messages = True
break
else:
self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True)
self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": "Request payload too large (413). Cannot compress further.",
"partial": True,
"failed": True,
"compression_exhausted": True,
}
# Check for context-length errors BEFORE generic 4xx handler.
# The classifier detects context overflow from: explicit error
# messages, generic 400 + large session heuristic (#1630), and
# server disconnect + large session pattern (#2153).
is_context_length_error = (
classified.reason == FailoverReason.context_overflow
)
if is_context_length_error:
compressor = self.context_compressor
old_ctx = compressor.context_length
# ── Distinguish two very different errors ───────────
# 1. "Prompt too long": the INPUT exceeds the context window.
# Fix: reduce context_length + compress history.
# 2. "max_tokens too large": input is fine, but
# input_tokens + requested max_tokens > context_window.
# Fix: reduce max_tokens (the OUTPUT cap) for this call.
# Do NOT shrink context_length — the window is unchanged.
#
# Note: max_tokens = output token cap (one response).
# context_length = total window (input + output combined).
available_out = parse_available_output_tokens_from_error(error_msg)
if available_out is not None:
# Error is purely about the output cap being too large.
# Cap output to the available space and retry without
# touching context_length or triggering compression.
safe_out = max(1, available_out - 64) # small safety margin
self._ephemeral_max_output_tokens = safe_out
self._vprint(
f"{self.log_prefix}⚠️ Output cap too large for current prompt — "
f"retrying with max_tokens={safe_out:,} "
f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})",
force=True,
)
# Still count against compression_attempts so we don't
# loop forever if the error keeps recurring.
compression_attempts += 1
if compression_attempts > max_compression_attempts:
self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
"partial": True,
"failed": True,
"compression_exhausted": True,
}
restart_with_compressed_messages = True
break
# Error is about the INPUT being too large — reduce context_length.
# Try to parse the actual limit from the error message
parsed_limit = parse_context_limit_from_error(error_msg)
_provider_lower = (getattr(self, "provider", "") or "").lower()
_base_lower = (getattr(self, "base_url", "") or "").rstrip("/").lower()
is_minimax_provider = (
_provider_lower in {"minimax", "minimax-cn"}
or _base_lower.startswith((
"https://api.minimax.io/anthropic",
"https://api.minimaxi.com/anthropic",
))
)
minimax_delta_only_overflow = (
is_minimax_provider
and parsed_limit is None
and "context window exceeds limit (" in error_msg
)
if parsed_limit and parsed_limit < old_ctx:
new_ctx = parsed_limit
self._vprint(f"{self.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
elif minimax_delta_only_overflow:
new_ctx = old_ctx
self._vprint(
f"{self.log_prefix}Provider reported overflow amount only; "
f"keeping context_length at {old_ctx:,} tokens and compressing.",
force=True,
)
else:
# Step down to the next probe tier
new_ctx = get_next_probe_tier(old_ctx)
if new_ctx and new_ctx < old_ctx:
compressor.update_model(
model=self.model,
context_length=new_ctx,
base_url=self.base_url,
api_key=getattr(self, "api_key", ""),
provider=self.provider,
)
# Context probing flags — only set on built-in
# compressor (plugin engines manage their own).
if hasattr(compressor, "_context_probed"):
compressor._context_probed = True
# Only persist limits parsed from the provider's
# error message (a real number). Guessed fallback
# tiers from get_next_probe_tier() should stay
# in-memory only — persisting them pollutes the
# cache with wrong values.
compressor._context_probe_persistable = bool(
parsed_limit and parsed_limit == new_ctx
)
self._vprint(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,}{new_ctx:,} tokens", force=True)
else:
self._vprint(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...", force=True)
compression_attempts += 1
if compression_attempts > max_compression_attempts:
self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
"partial": True,
"failed": True,
"compression_exhausted": True,
}
self._emit_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...")
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens,
task_id=effective_task_id,
)
# Compression created a new session — clear history
# so _flush_messages_to_session_db writes compressed
# messages to the new session, not skipping them.
conversation_history = None
if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
if len(messages) < original_len:
self._emit_status(f"🗜️ Compressed {original_len}{len(messages)} messages, retrying...")
time.sleep(2) # Brief pause between compression retries
restart_with_compressed_messages = True
break
else:
# Can't compress further and already at minimum tier
self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
self._vprint(f"{self.log_prefix} 💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True)
logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
"partial": True,
"failed": True,
"compression_exhausted": True,
}
# Check for non-retryable client errors. The classifier
# already accounts for 413, 429, 529 (transient), context
# overflow, and generic-400 heuristics. Local validation
# errors (ValueError, TypeError) are programming bugs.
# Exclude UnicodeEncodeError — it's a ValueError subclass
# but is handled separately by the surrogate sanitization
# path above. Exclude json.JSONDecodeError — also a
# ValueError subclass, but it indicates a transient
# provider/network failure (malformed response body,
# truncated stream, routing layer corruption), not a
# local programming bug, and should be retried (#14782).
is_local_validation_error = (
isinstance(api_error, (ValueError, TypeError))
and not isinstance(
api_error, (UnicodeEncodeError, json.JSONDecodeError)
)
# ssl.SSLError (and its subclass SSLCertVerificationError)
# inherits from OSError *and* ValueError via Python MRO,
# so the isinstance(ValueError) check above would
# misclassify a TLS transport failure as a local
# programming bug and abort without retrying. Exclude
# ssl.SSLError explicitly so the error classifier's
# retryable=True mapping takes effect instead.
and not isinstance(api_error, ssl.SSLError)
)
is_client_error = (
is_local_validation_error
or (
not classified.retryable
and not classified.should_compress
and classified.reason not in {
FailoverReason.rate_limit,
FailoverReason.billing,
FailoverReason.overloaded,
FailoverReason.context_overflow,
FailoverReason.payload_too_large,
FailoverReason.long_context_tier,
FailoverReason.thinking_signature,
}
)
) and not is_context_length_error
if is_client_error:
# Try fallback before aborting — a different provider
# may not have the same issue (rate limit, auth, etc.)
self._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
if self._try_activate_fallback():
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
if api_kwargs is not None:
self._dump_api_request_debug(
api_kwargs, reason="non_retryable_client_error", error=api_error,
)
self._emit_status(
f"❌ Non-retryable error (HTTP {status_code}): "
f"{self._summarize_api_error(api_error)}"
)
self._vprint(f"{self.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
self._vprint(f"{self.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True)
self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True)
# Actionable guidance for common auth errors
if classified.is_auth or classified.reason == FailoverReason.billing:
if _provider == "openai-codex" and status_code == 401:
self._vprint(f"{self.log_prefix} 💡 Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True)
self._vprint(f"{self.log_prefix} refreshed by another client (Codex CLI, VS Code). To fix:", force=True)
self._vprint(f"{self.log_prefix} 1. Run `codex` in your terminal to generate fresh tokens.", force=True)
self._vprint(f"{self.log_prefix} 2. Then run `hermes auth` to re-authenticate.", force=True)
else:
self._vprint(f"{self.log_prefix} 💡 Your API key was rejected by the provider. Check:", force=True)
self._vprint(f"{self.log_prefix} • Is the key valid? Run: hermes setup", force=True)
self._vprint(f"{self.log_prefix} • Does your account have access to {_model}?", force=True)
if base_url_host_matches(str(_base), "openrouter.ai"):
self._vprint(f"{self.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True)
else:
self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
# Skip session persistence when the error is likely
# context-overflow related (status 400 + large session).
# Persisting the failed user message would make the
# session even larger, causing the same failure on the
# next attempt. (#1630)
if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80):
self._vprint(
f"{self.log_prefix}⚠️ Skipping session persistence "
f"for large failed session to prevent growth loop.",
force=True,
)
else:
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": str(api_error),
}
if retry_count >= max_retries:
# Before falling back, try rebuilding the primary
# client once for transient transport errors (stale
# connection pool, TCP reset). Only attempted once
# per API call block.
if not primary_recovery_attempted and self._try_recover_primary_transport(
api_error, retry_count=retry_count, max_retries=max_retries,
):
primary_recovery_attempted = True
retry_count = 0
continue
# Try fallback before giving up entirely
self._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
if self._try_activate_fallback():
retry_count = 0
compression_attempts = 0
primary_recovery_attempted = False
continue
_final_summary = self._summarize_api_error(api_error)
if is_rate_limited:
self._emit_status(f"❌ Rate limited after {max_retries} retries — {_final_summary}")
else:
self._emit_status(f"❌ API failed after {max_retries} retries — {_final_summary}")
self._vprint(f"{self.log_prefix} 💀 Final error: {_final_summary}", force=True)
# Detect SSE stream-drop pattern (e.g. "Network
# connection lost") and surface actionable guidance.
# This typically happens when the model generates a
# very large tool call (write_file with huge content)
# and the proxy/CDN drops the stream mid-response.
_is_stream_drop = (
not getattr(api_error, "status_code", None)
and any(p in error_msg for p in (
"connection lost", "connection reset",
"connection closed", "network connection",
"network error", "terminated",
))
)
if _is_stream_drop:
self._vprint(
f"{self.log_prefix} 💡 The provider's stream "
f"connection keeps dropping. This often happens "
f"when the model tries to write a very large "
f"file in a single tool call.",
force=True,
)
self._vprint(
f"{self.log_prefix} Try asking the model "
f"to use execute_code with Python's open() for "
f"large files, or to write the file in smaller "
f"sections.",
force=True,
)
logging.error(
"%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s",
self.log_prefix, max_retries, _final_summary,
_provider, _model, len(api_messages), f"{approx_tokens:,}",
)
if api_kwargs is not None:
self._dump_api_request_debug(
api_kwargs, reason="max_retries_exhausted", error=api_error,
)
self._persist_session(messages, conversation_history)
_final_response = f"API call failed after {max_retries} retries: {_final_summary}"
if _is_stream_drop:
_final_response += (
"\n\nThe provider's stream connection keeps "
"dropping — this often happens when generating "
"very large tool call responses (e.g. write_file "
"with long content). Try asking me to use "
"execute_code with Python's open() for large "
"files, or to write in smaller sections."
)
return {
"final_response": _final_response,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": _final_summary,
}
# For rate limits, respect the Retry-After header if present
_retry_after = None
if is_rate_limited:
_resp_headers = getattr(getattr(api_error, "response", None), "headers", None)
if _resp_headers and hasattr(_resp_headers, "get"):
_ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
if _ra_raw:
try:
_retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes
except (TypeError, ValueError):
pass
wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
if is_rate_limited:
self._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
else:
self._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
logger.warning(
"Retrying API call in %ss (attempt %s/%s) %s error=%s",
wait_time,
retry_count,
max_retries,
self._client_log_context(),
api_error,
)
# Sleep in small increments so we can respond to interrupts quickly
# instead of blocking the entire wait_time in one sleep() call
sleep_end = time.time() + wait_time
_backoff_touch_counter = 0
while time.time() < sleep_end:
if self._interrupt_requested:
self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"interrupted": True,
}
time.sleep(0.2) # Check interrupt every 200ms
# Touch activity every ~30s so the gateway's inactivity
# monitor knows we're alive during backoff waits.
_backoff_touch_counter += 1
if _backoff_touch_counter % 150 == 0: # 150 × 0.2s = 30s
self._touch_activity(
f"error retry backoff ({retry_count}/{max_retries}), "
f"{int(sleep_end - time.time())}s remaining"
)
# If the API call was interrupted, skip response processing
if interrupted:
_turn_exit_reason = "interrupted_during_api_call"
break
if restart_with_compressed_messages:
api_call_count -= 1
self.iteration_budget.refund()
# Count compression restarts toward the retry limit to prevent
# infinite loops when compression reduces messages but not enough
# to fit the context window.
retry_count += 1
restart_with_compressed_messages = False
continue
if restart_with_length_continuation:
# Progressively boost the output token budget on each retry.
# Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
# Applies to all providers via _ephemeral_max_output_tokens.
_boost_base = self.max_tokens if self.max_tokens else 4096
_boost = _boost_base * (length_continue_retries + 1)
self._ephemeral_max_output_tokens = min(_boost, 32768)
continue
# Guard: if all retries exhausted without a successful response
# (e.g. repeated context-length errors that exhausted retry_count),
# the `response` variable is still None. Break out cleanly.
if response is None:
_turn_exit_reason = "all_retries_exhausted_no_response"
print(f"{self.log_prefix}❌ All API retries exhausted with no successful response.")
self._persist_session(messages, conversation_history)
break
try:
_transport = self._get_transport()
_normalize_kwargs = {}
if self.api_mode == "anthropic_messages":
_normalize_kwargs["strip_tool_prefix"] = self._is_anthropic_oauth
normalized = _transport.normalize_response(response, **_normalize_kwargs)
assistant_message = normalized
finish_reason = normalized.finish_reason
# Normalize content to string — some OpenAI-compatible servers
# (llama-server, etc.) return content as a dict or list instead
# of a plain string, which crashes downstream .strip() calls.
if assistant_message.content is not None and not isinstance(assistant_message.content, str):
raw = assistant_message.content
if isinstance(raw, dict):
assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw)
elif isinstance(raw, list):
# Multimodal content list — extract text parts
parts = []
for part in raw:
if isinstance(part, str):
parts.append(part)
elif isinstance(part, dict) and part.get("type") == "text":
parts.append(part.get("text", ""))
elif isinstance(part, dict) and "text" in part:
parts.append(str(part["text"]))
assistant_message.content = "\n".join(parts)
else:
assistant_message.content = str(raw)
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or []
_assistant_text = assistant_message.content or ""
_invoke_hook(
"post_api_request",
task_id=effective_task_id,
session_id=self.session_id or "",
platform=self.platform or "",
model=self.model,
provider=self.provider,
base_url=self.base_url,
api_mode=self.api_mode,
api_call_count=api_call_count,
api_duration=api_duration,
finish_reason=finish_reason,
message_count=len(api_messages),
response_model=getattr(response, "model", None),
usage=self._usage_summary_for_api_request_hook(response),
assistant_content_chars=len(_assistant_text),
assistant_tool_call_count=len(_assistant_tool_calls),
)
except Exception:
pass
# Handle assistant response
if assistant_message.content and not self.quiet_mode:
if self.verbose_logging:
self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content}")
else:
self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
# Notify progress callback of model's thinking (used by subagent
# delegation to relay the child's reasoning to the parent display).
if (assistant_message.content and self.tool_progress_callback):
_think_text = assistant_message.content.strip()
# Strip reasoning XML tags that shouldn't leak to parent display
_think_text = re.sub(
r'</?(?:REASONING_SCRATCHPAD|think|reasoning)>', '', _think_text
).strip()
# For subagents: relay first line to parent display (existing behaviour).
# For all agents with a structured callback: emit reasoning.available event.
first_line = _think_text.split('\n')[0][:80] if _think_text else ""
if first_line and getattr(self, '_delegate_depth', 0) > 0:
try:
self.tool_progress_callback("_thinking", first_line)
except Exception:
pass
elif _think_text:
try:
self.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None)
except Exception:
pass
# Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
# This means the model ran out of output tokens mid-reasoning — retry up to 2 times
if has_incomplete_scratchpad(assistant_message.content or ""):
self._incomplete_scratchpad_retries += 1
self._vprint(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
if self._incomplete_scratchpad_retries <= 2:
self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
# Don't add the broken message, just retry
continue
else:
# Max retries - discard this turn and save as partial
self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
self._incomplete_scratchpad_retries = 0
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": rolled_back_messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
}
# Reset incomplete scratchpad counter on clean response
self._incomplete_scratchpad_retries = 0
if self.api_mode == "codex_responses" and finish_reason == "incomplete":
self._codex_incomplete_retries += 1
interim_msg = self._build_assistant_message(assistant_message, finish_reason)
interim_has_content = bool((interim_msg.get("content") or "").strip())
interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False
interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items"))
interim_has_codex_message_items = bool(interim_msg.get("codex_message_items"))
if (
interim_has_content
or interim_has_reasoning
or interim_has_codex_reasoning
or interim_has_codex_message_items
):
last_msg = messages[-1] if messages else None
# Duplicate detection: two consecutive incomplete assistant
# messages with identical content AND reasoning are collapsed.
# For provider-state-only changes (encrypted reasoning
# items or replayable message ids/phases/statuses differ
# while visible content/reasoning are unchanged), compare
# those opaque payloads too so we don't silently drop the
# newer continuation state.
last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None
interim_codex_items = interim_msg.get("codex_reasoning_items")
last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None
interim_codex_message_items = interim_msg.get("codex_message_items")
duplicate_interim = (
isinstance(last_msg, dict)
and last_msg.get("role") == "assistant"
and last_msg.get("finish_reason") == "incomplete"
and (last_msg.get("content") or "") == (interim_msg.get("content") or "")
and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "")
and last_codex_items == interim_codex_items
and last_codex_message_items == interim_codex_message_items
)
if not duplicate_interim:
messages.append(interim_msg)
self._emit_interim_assistant_message(interim_msg)
if self._codex_incomplete_retries < 3:
if not self.quiet_mode:
self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)")
self._session_messages = messages
self._save_session_log(messages)
continue
self._codex_incomplete_retries = 0
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Codex response remained incomplete after 3 continuation attempts",
}
elif hasattr(self, "_codex_incomplete_retries"):
self._codex_incomplete_retries = 0
# Check for tool calls
if assistant_message.tool_calls:
if not self.quiet_mode:
self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")
if self.verbose_logging:
for tc in assistant_message.tool_calls:
logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
# Validate tool call names - detect model hallucinations
# Repair mismatched tool names before validating
for tc in assistant_message.tool_calls:
if tc.function.name not in self.valid_tool_names:
repaired = self._repair_tool_call(tc.function.name)
if repaired:
print(f"{self.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'")
tc.function.name = repaired
invalid_tool_calls = [
tc.function.name for tc in assistant_message.tool_calls
if tc.function.name not in self.valid_tool_names
]
if invalid_tool_calls:
# Track retries for invalid tool calls
self._invalid_tool_retries += 1
# Return helpful error to model — model can self-correct next turn
available = ", ".join(sorted(self.valid_tool_names))
invalid_name = invalid_tool_calls[0]
invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")
if self._invalid_tool_retries >= 3:
self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
self._invalid_tool_retries = 0
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": f"Model generated invalid tool call: {invalid_preview}"
}
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(assistant_msg)
for tc in assistant_message.tool_calls:
if tc.function.name not in self.valid_tool_names:
content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
else:
content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call."
messages.append({
"role": "tool",
"name": tc.function.name,
"tool_call_id": tc.id,
"content": content,
})
continue
# Reset retry counter on successful tool call validation
self._invalid_tool_retries = 0
# Validate tool call arguments are valid JSON
# Handle empty strings as empty objects (common model quirk)
invalid_json_args = []
for tc in assistant_message.tool_calls:
args = tc.function.arguments
if isinstance(args, (dict, list)):
tc.function.arguments = json.dumps(args)
continue
if args is not None and not isinstance(args, str):
tc.function.arguments = str(args)
args = tc.function.arguments
# Treat empty/whitespace strings as empty object
if not args or not args.strip():
tc.function.arguments = "{}"
continue
try:
json.loads(args)
except json.JSONDecodeError as e:
invalid_json_args.append((tc.function.name, str(e)))
if invalid_json_args:
# Check if the invalid JSON is due to truncation rather
# than a model formatting mistake. Routers sometimes
# rewrite finish_reason from "length" to "tool_calls",
# hiding the truncation from the length handler above.
# Detect truncation: args that don't end with } or ]
# (after stripping whitespace) are cut off mid-stream.
_truncated = any(
not (tc.function.arguments or "").rstrip().endswith(("}", "]"))
for tc in assistant_message.tool_calls
if tc.function.name in {n for n, _ in invalid_json_args}
)
if _truncated:
self._vprint(
f"{self.log_prefix}⚠️ Truncated tool call arguments detected "
f"(finish_reason={finish_reason!r}) — refusing to execute.",
force=True,
)
self._invalid_json_retries = 0
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Response truncated due to output length limit",
}
# Track retries for invalid JSON arguments
self._invalid_json_retries += 1
tool_name, error_msg = invalid_json_args[0]
self._vprint(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")
if self._invalid_json_retries < 3:
self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...")
# Don't add anything to messages, just retry the API call
continue
else:
# Instead of returning partial, inject tool error results so the model can recover.
# Using tool results (not user messages) preserves role alternation.
self._vprint(f"{self.log_prefix}⚠️ Injecting recovery tool results for invalid JSON...")
self._invalid_json_retries = 0 # Reset for next attempt
# Append the assistant message with its (broken) tool_calls
recovery_assistant = self._build_assistant_message(assistant_message, finish_reason)
messages.append(recovery_assistant)
# Respond with tool error results for each tool call
invalid_names = {name for name, _ in invalid_json_args}
for tc in assistant_message.tool_calls:
if tc.function.name in invalid_names:
err = next(e for n, e in invalid_json_args if n == tc.function.name)
tool_result = (
f"Error: Invalid JSON arguments. {err}. "
f"For tools with no required parameters, use an empty object: {{}}. "
f"Please retry with valid JSON."
)
else:
tool_result = "Skipped: other tool call in this response had invalid JSON."
messages.append({
"role": "tool",
"name": tc.function.name,
"tool_call_id": tc.id,
"content": tool_result,
})
continue
# Reset retry counter on successful JSON validation
self._invalid_json_retries = 0
# ── Post-call guardrails ──────────────────────────
assistant_message.tool_calls = self._cap_delegate_task_calls(
assistant_message.tool_calls
)
assistant_message.tool_calls = self._deduplicate_tool_calls(
assistant_message.tool_calls
)
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
# If this turn has both content AND tool_calls, capture the content
# as a fallback final response. Common pattern: model delivers its
# answer and calls memory/skill tools as a side-effect in the same
# turn. If the follow-up turn after tools is empty, we use this.
turn_content = assistant_message.content or ""
if turn_content and self._has_content_after_think_block(turn_content):
self._last_content_with_tools = turn_content
# Only mute subsequent output when EVERY tool call in
# this turn is post-response housekeeping (memory, todo,
# skill_manage, etc.). If any substantive tool is present
# (search_files, read_file, write_file, terminal, ...),
# keep output visible so the user sees progress.
_HOUSEKEEPING_TOOLS = frozenset({
"memory", "todo", "skill_manage", "session_search",
})
_all_housekeeping = all(
tc.function.name in _HOUSEKEEPING_TOOLS
for tc in assistant_message.tool_calls
)
self._last_content_tools_all_housekeeping = _all_housekeeping
if _all_housekeeping and self._has_stream_consumers():
self._mute_post_response = True
elif self._should_emit_quiet_tool_messages():
clean = self._strip_think_blocks(turn_content).strip()
if clean:
self._vprint(f" ┊ 💬 {clean}")
# Pop thinking-only prefill message(s) before appending
# (tool-call path — same rationale as the final-response path).
_had_prefill = False
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("_thinking_prefill")
):
messages.pop()
_had_prefill = True
# Reset prefill counter when tool calls follow a prefill
# recovery. Without this, the counter accumulates across
# the whole conversation — a model that intermittently
# empties (empty → prefill → tools → empty → prefill →
# tools) burns both prefill attempts and the third empty
# gets zero recovery. Resetting here treats each tool-
# call success as a fresh start.
if _had_prefill:
self._thinking_prefill_retries = 0
self._empty_content_retries = 0
# Successful tool execution — reset the post-tool nudge
# flag so it can fire again if the model goes empty on
# a LATER tool round.
self._post_tool_empty_retried = False
messages.append(assistant_msg)
self._emit_interim_assistant_message(assistant_msg)
# Close any open streaming display (response box, reasoning
# box) before tool execution begins. Intermediate turns may
# have streamed early content that opened the response box;
# flushing here prevents it from wrapping tool feed lines.
# Only signal the display callback — TTS (_stream_callback)
# should NOT receive None (it uses None as end-of-stream).
if self.stream_delta_callback:
try:
self.stream_delta_callback(None)
except Exception:
pass
self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)
if self._tool_guardrail_halt_decision is not None:
decision = self._tool_guardrail_halt_decision
_turn_exit_reason = "guardrail_halt"
final_response = self._toolguard_controlled_halt_response(decision)
self._emit_status(
f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}"
)
messages.append({"role": "assistant", "content": final_response})
break
# Reset per-turn retry counters after successful tool
# execution so a single truncation doesn't poison the
# entire conversation.
truncated_tool_call_retries = 0
# Signal that a paragraph break is needed before the next
# streamed text. We don't emit it immediately because
# multiple consecutive tool iterations would stack up
# redundant blank lines. Instead, _fire_stream_delta()
# will prepend a single "\n\n" the next time real text
# arrives.
self._stream_needs_break = True
# Refund the iteration if the ONLY tool(s) called were
# execute_code (programmatic tool calling). These are
# cheap RPC-style calls that shouldn't eat the budget.
_tc_names = {tc.function.name for tc in assistant_message.tool_calls}
if _tc_names == {"execute_code"}:
self.iteration_budget.refund()
# Use real token counts from the API response to decide
# compression. prompt_tokens + completion_tokens is the
# actual context size the provider reported plus the
# assistant turn — a tight lower bound for the next prompt.
# Tool results appended above aren't counted yet, but the
# threshold (default 50%) leaves ample headroom; if tool
# results push past it, the next API call will report the
# real total and trigger compression then.
#
# If last_prompt_tokens is 0 (stale after API disconnect
# or provider returned no usage data), fall back to rough
# estimate to avoid missing compression. Without this,
# a session can grow unbounded after disconnects because
# should_compress(0) never fires. (#2153)
_compressor = self.context_compressor
if _compressor.last_prompt_tokens > 0:
# Only use prompt_tokens — completion/reasoning
# tokens don't consume context window space.
# Thinking models (GLM-5.1, QwQ, DeepSeek R1)
# inflate completion_tokens with reasoning,
# causing premature compression. (#12026)
_real_tokens = _compressor.last_prompt_tokens
else:
# Include tool schemas — with 50+ tools enabled
# these add 20-30K tokens the messages-only
# estimate misses, which can skip compression
# past the configured threshold (#14695).
_real_tokens = estimate_request_tokens_rough(
messages, tools=self.tools or None
)
if self.compression_enabled and _compressor.should_compress(_real_tokens):
self._safe_print(" ⟳ compacting context…")
messages, active_system_prompt = self._compress_context(
messages, system_message,
approx_tokens=self.context_compressor.last_prompt_tokens,
task_id=effective_task_id,
)
# Compression created a new session — clear history so
# _flush_messages_to_session_db writes compressed messages
# to the new session (see preflight compression comment).
conversation_history = None
# Save session log incrementally (so progress is visible even if interrupted)
self._session_messages = messages
self._save_session_log(messages)
# Continue loop for next response
continue
else:
# No tool calls - this is the final response
final_response = assistant_message.content or ""
# Fix: unmute output when entering the no-tool-call branch
# so the user can see empty-response warnings and recovery
# status messages. _mute_post_response was set during a
# prior housekeeping tool turn and should not silence the
# final response path.
self._mute_post_response = False
# Check if response only has think block with no actual content after it
if not self._has_content_after_think_block(final_response):
# ── Partial stream recovery ─────────────────────
# If content was already streamed to the user before
# the connection died, use it as the final response
# instead of falling through to prior-turn fallback
# or wasting API calls on retries.
_partial_streamed = (
getattr(self, "_current_streamed_assistant_text", "") or ""
)
if self._has_content_after_think_block(_partial_streamed):
_turn_exit_reason = "partial_stream_recovery"
_recovered = self._strip_think_blocks(_partial_streamed).strip()
logger.info(
"Partial stream content delivered (%d chars) "
"— using as final response",
len(_recovered),
)
self._emit_status(
"↻ Stream interrupted — using delivered content "
"as final response"
)
final_response = _recovered
self._response_was_previewed = True
break
# If the previous turn already delivered real content alongside
# HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save),
# the model has nothing more to say. Use the earlier content
# immediately instead of wasting API calls on retries.
# NOTE: Only use this shortcut when ALL tools in that turn were
# housekeeping (memory, todo, etc.). When substantive tools
# were called (terminal, search_files, etc.), the content was
# likely mid-task narration ("I'll scan the directory...") and
# the empty follow-up means the model choked — let the
# post-tool nudge below handle that instead of exiting early.
fallback = getattr(self, '_last_content_with_tools', None)
if fallback and getattr(self, '_last_content_tools_all_housekeeping', False):
_turn_exit_reason = "fallback_prior_turn_content"
logger.info("Empty follow-up after tool calls — using prior turn content as final response")
self._emit_status("↻ Empty response after tool calls — using earlier content as final answer")
self._last_content_with_tools = None
self._last_content_tools_all_housekeeping = False
self._empty_content_retries = 0
# Do NOT modify the assistant message content — the
# old code injected "Calling the X tools..." which
# poisoned the conversation history. Just use the
# fallback text as the final response and break.
final_response = self._strip_think_blocks(fallback).strip()
self._response_was_previewed = True
break
# ── Post-tool-call empty response nudge ───────────
# The model returned empty after executing tool calls.
# This covers two cases:
# (a) No prior-turn content at all — model went silent
# (b) Prior turn had content + SUBSTANTIVE tools (the
# fallback above was skipped because the content
# was mid-task narration, not a final answer)
# Instead of giving up, nudge the model to continue by
# appending a user-level hint. This is the #9400 case:
# weaker models (mimo-v2-pro, GLM-5, etc.) sometimes
# return empty after tool results instead of continuing
# to the next step. One retry with a nudge usually
# fixes it.
_prior_was_tool = any(
m.get("role") == "tool"
for m in messages[-5:] # check recent messages
)
# Detect Qwen3/Ollama-style in-content thinking blocks.
# Ollama puts <think> in the content field (not in
# reasoning_content), so _has_structured below would
# miss it. We check here so thinking-only responses
# after tool calls route to prefill instead of nudge.
_has_inline_thinking = bool(
re.search(
r'<think>|<thinking>|<reasoning>',
final_response or "",
re.IGNORECASE,
)
)
if (
_prior_was_tool
and not getattr(self, "_post_tool_empty_retried", False)
and not _has_inline_thinking # thinking model still working — let prefill handle
):
self._post_tool_empty_retried = True
# Clear stale narration so it doesn't resurface
# on a later empty response after the nudge.
self._last_content_with_tools = None
self._last_content_tools_all_housekeeping = False
logger.info(
"Empty response after tool calls — nudging model "
"to continue processing"
)
self._emit_status(
"⚠️ Model returned empty after tool calls — "
"nudging to continue"
)
# Append the empty assistant message first so the
# message sequence stays valid:
# tool(result) → assistant("(empty)") → user(nudge)
# Without this, we'd have tool → user which most
# APIs reject as an invalid sequence.
_nudge_msg = self._build_assistant_message(assistant_message, finish_reason)
_nudge_msg["content"] = "(empty)"
_nudge_msg["_empty_recovery_synthetic"] = True
messages.append(_nudge_msg)
messages.append({
"role": "user",
"content": (
"You just executed tool calls but returned an "
"empty response. Please process the tool "
"results above and continue with the task."
),
"_empty_recovery_synthetic": True,
})
continue
# ── Thinking-only prefill continuation ──────────
# The model produced structured reasoning (via API
# fields) but no visible text content. Rather than
# giving up, append the assistant message as-is and
# continue — the model will see its own reasoning
# on the next turn and produce the text portion.
# Inspired by clawdbot's "incomplete-text" recovery.
# Also covers Qwen3/Ollama in-content <think> blocks
# (detected above as _has_inline_thinking).
_has_structured = bool(
getattr(assistant_message, "reasoning", None)
or getattr(assistant_message, "reasoning_content", None)
or getattr(assistant_message, "reasoning_details", None)
or _has_inline_thinking
)
if _has_structured and self._thinking_prefill_retries < 2:
self._thinking_prefill_retries += 1
logger.info(
"Thinking-only response (no visible content) — "
"prefilling to continue (%d/2)",
self._thinking_prefill_retries,
)
self._emit_status(
f"↻ Thinking-only response — prefilling to continue "
f"({self._thinking_prefill_retries}/2)"
)
interim_msg = self._build_assistant_message(
assistant_message, "incomplete"
)
interim_msg["_thinking_prefill"] = True
messages.append(interim_msg)
self._session_messages = messages
self._save_session_log(messages)
continue
# ── Empty response retry ──────────────────────
# Model returned nothing usable. Retry up to 3
# times before attempting fallback. This covers
# both truly empty responses (no content, no
# reasoning) AND reasoning-only responses after
# prefill exhaustion — models like mimo-v2-pro
# always populate reasoning fields via OpenRouter,
# so the old `not _has_structured` guard blocked
# retries for every reasoning model after prefill.
_truly_empty = not self._strip_think_blocks(
final_response
).strip()
_prefill_exhausted = (
_has_structured
and self._thinking_prefill_retries >= 2
)
if _truly_empty and (not _has_structured or _prefill_exhausted) and self._empty_content_retries < 3:
self._empty_content_retries += 1
logger.warning(
"Empty response (no content or reasoning) — "
"retry %d/3 (model=%s)",
self._empty_content_retries, self.model,
)
self._emit_status(
f"⚠️ Empty response from model — retrying "
f"({self._empty_content_retries}/3)"
)
continue
# ── Exhausted retries — try fallback provider ──
# Before giving up with "(empty)", attempt to
# switch to the next provider in the fallback
# chain. This covers the case where a model
# (e.g. GLM-4.5-Air) consistently returns empty
# due to context degradation or provider issues.
if _truly_empty and self._fallback_chain:
logger.warning(
"Empty response after %d retries — "
"attempting fallback (model=%s, provider=%s)",
self._empty_content_retries, self.model,
self.provider,
)
self._emit_status(
"⚠️ Model returning empty responses — "
"switching to fallback provider..."
)
if self._try_activate_fallback():
self._empty_content_retries = 0
self._emit_status(
f"↻ Switched to fallback: {self.model} "
f"({self.provider})"
)
logger.info(
"Fallback activated after empty responses: "
"now using %s on %s",
self.model, self.provider,
)
continue
# Exhausted retries and fallback chain (or no
# fallback configured). Fall through to the
# "(empty)" terminal.
_turn_exit_reason = "empty_response_exhausted"
reasoning_text = self._extract_reasoning(assistant_message)
self._drop_trailing_empty_response_scaffolding(messages)
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
assistant_msg["content"] = "(empty)"
# This is a user-facing failure sentinel for the gateway,
# not real assistant content. Persisting it makes later
# "continue" turns replay assistant("(empty)") as if it
# were a meaningful model response, which can keep long
# tool-heavy sessions stuck in empty-response loops.
assistant_msg["_empty_terminal_sentinel"] = True
messages.append(assistant_msg)
if reasoning_text:
reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
logger.warning(
"Reasoning-only response (no visible content) "
"after exhausting retries and fallback. "
"Reasoning: %s", reasoning_preview,
)
self._emit_status(
"⚠️ Model produced reasoning but no visible "
"response after all retries. Returning empty."
)
else:
logger.warning(
"Empty response (no content or reasoning) "
"after %d retries. No fallback available. "
"model=%s provider=%s",
self._empty_content_retries, self.model,
self.provider,
)
self._emit_status(
"❌ Model returned no content after all retries"
+ (" and fallback attempts." if self._fallback_chain else
". No fallback providers configured.")
)
final_response = "(empty)"
break
# Reset retry counter/signature on successful content
self._empty_content_retries = 0
self._thinking_prefill_retries = 0
if (
self.api_mode == "codex_responses"
and self.valid_tool_names
and codex_ack_continuations < 2
and self._looks_like_codex_intermediate_ack(
user_message=user_message,
assistant_content=final_response,
messages=messages,
)
):
codex_ack_continuations += 1
interim_msg = self._build_assistant_message(assistant_message, "incomplete")
messages.append(interim_msg)
self._emit_interim_assistant_message(interim_msg)
continue_msg = {
"role": "user",
"content": (
"[System: Continue now. Execute the required tool calls and only "
"send your final answer after completing the task.]"
),
}
messages.append(continue_msg)
self._session_messages = messages
self._save_session_log(messages)
continue
codex_ack_continuations = 0
if truncated_response_prefix:
final_response = truncated_response_prefix + final_response
truncated_response_prefix = ""
length_continue_retries = 0
final_response = self._strip_think_blocks(final_response).strip()
final_msg = self._build_assistant_message(assistant_message, finish_reason)
# Pop thinking-only prefill and empty-response retry
# scaffolding before appending the final response. These
# internal turns are only for the next API retry and should
# not become durable transcript context.
while (
messages
and isinstance(messages[-1], dict)
and (
messages[-1].get("_thinking_prefill")
or messages[-1].get("_empty_recovery_synthetic")
or messages[-1].get("_empty_terminal_sentinel")
)
):
messages.pop()
messages.append(final_msg)
_turn_exit_reason = f"text_response(finish_reason={finish_reason})"
if not self.quiet_mode:
self._safe_print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
break
except Exception as e:
error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}"
try:
print(f"{error_msg}")
except (OSError, ValueError):
logger.error(error_msg)
logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True)
# If an assistant message with tool_calls was already appended,
# the API expects a role="tool" result for every tool_call_id.
# Fill in error results for any that weren't answered yet.
for idx in range(len(messages) - 1, -1, -1):
msg = messages[idx]
if not isinstance(msg, dict):
break
if msg.get("role") == "tool":
continue
if msg.get("role") == "assistant" and msg.get("tool_calls"):
answered_ids = {
m["tool_call_id"]
for m in messages[idx + 1:]
if isinstance(m, dict) and m.get("role") == "tool"
}
for tc in msg["tool_calls"]:
if not tc or not isinstance(tc, dict): continue
if tc["id"] not in answered_ids:
err_msg = {
"role": "tool",
"name": AIAgent._get_tool_call_name_static(tc),
"tool_call_id": tc["id"],
"content": f"Error executing tool: {error_msg}",
}
messages.append(err_msg)
break
# Non-tool errors don't need a synthetic message injected.
# The error is already printed to the user (line above), and
# the retry loop continues. Injecting a fake user/assistant
# message pollutes history, burns tokens, and risks violating
# role-alternation invariants.
# If we're near the limit, break to avoid infinite loops
if api_call_count >= self.max_iterations - 1:
_turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})"
final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
# Append as assistant so the history stays valid for
# session resume (avoids consecutive user messages).
messages.append({"role": "assistant", "content": final_response})
break
if final_response is None and (
api_call_count >= self.max_iterations
or self.iteration_budget.remaining <= 0
):
# Budget exhausted — ask the model for a summary via one extra
# API call with tools stripped. _handle_max_iterations injects a
# user message and makes a single toolless request.
_turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})"
self._emit_status(
f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
"— asking model to summarise"
)
if not self.quiet_mode:
self._safe_print(
f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
"— requesting summary..."
)
final_response = self._handle_max_iterations(messages, api_call_count)
# If running as a kanban worker, block the task so the dispatcher
# knows the worker could not complete (rather than treating it as a
# protocol violation). The agent loop strips tools before calling
# _handle_max_iterations, so the model cannot call kanban_block
# itself — we must do it on its behalf.
_kanban_task = os.environ.get("HERMES_KANBAN_TASK")
if _kanban_task:
try:
handle_function_call(
"kanban_block",
{
"task_id": _kanban_task,
"reason": (
f"Iteration budget exhausted "
f"({api_call_count}/{self.max_iterations}) — "
"task could not complete within the allowed "
"iterations"
),
},
task_id=effective_task_id,
)
logger.info(
"kanban_block called for task %s after iteration "
"exhaustion (%d/%d)",
_kanban_task, api_call_count, self.max_iterations,
)
except Exception:
logger.warning(
"Failed to call kanban_block after iteration "
"exhaustion for task %s",
_kanban_task,
exc_info=True,
)
# Determine if conversation completed successfully
completed = final_response is not None and api_call_count < self.max_iterations
# Save trajectory if enabled. ``user_message`` may be a multimodal
# list of parts; the trajectory format wants a plain string.
self._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed)
# Clean up VM and browser for this task after conversation completes
self._cleanup_task_resources(effective_task_id)
# Persist session to both JSON log and SQLite only after private retry
# scaffolding has been removed. Otherwise a later user "continue" turn
# can replay assistant("(empty)") / recovery nudges and fall into the
# same empty-response loop again.
self._drop_trailing_empty_response_scaffolding(messages)
self._persist_session(messages, conversation_history)
# ── Turn-exit diagnostic log ─────────────────────────────────────
# Always logged at INFO so agent.log captures WHY every turn ended.
# When the last message is a tool result (agent was mid-work), log
# at WARNING — this is the "just stops" scenario users report.
_last_msg_role = messages[-1].get("role") if messages else None
_last_tool_name = None
if _last_msg_role == "tool":
# Walk back to find the assistant message with the tool call
for _m in reversed(messages):
if _m.get("role") == "assistant" and _m.get("tool_calls"):
_tcs = _m["tool_calls"]
if _tcs and isinstance(_tcs[0], dict):
_last_tool_name = _tcs[-1].get("function", {}).get("name")
break
_turn_tool_count = sum(
1 for m in messages
if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls")
)
_resp_len = len(final_response) if final_response else 0
_budget_used = self.iteration_budget.used if self.iteration_budget else 0
_budget_max = self.iteration_budget.max_total if self.iteration_budget else 0
_diag_msg = (
"Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d "
"tool_turns=%d last_msg_role=%s response_len=%d session=%s"
)
_diag_args = (
_turn_exit_reason, self.model, api_call_count, self.max_iterations,
_budget_used, _budget_max,
_turn_tool_count, _last_msg_role, _resp_len,
self.session_id or "none",
)
if _last_msg_role == "tool" and not interrupted:
# Agent was mid-work — this is the "just stops" case.
logger.warning(
"Turn ended with pending tool result (agent may appear stuck). "
+ _diag_msg + " last_tool=%s",
*_diag_args, _last_tool_name,
)
else:
logger.info(_diag_msg, *_diag_args)
# File-mutation verifier footer.
# If one or more ``write_file`` / ``patch`` calls failed during this
# turn and were never superseded by a successful write to the same
# path, append an advisory footer to the assistant response. This
# catches the specific case — reported by Ben Eng (#15524-adjacent)
# — where a model issues a batch of parallel patches, half of them
# fail with "Could not find old_string", and the model summarises
# the turn claiming every file was edited. The user then has to
# manually run ``git status`` to catch the lie. With this footer
# the truth is surfaced on every turn, so over-claiming is
# structurally impossible past the model.
#
# Gate: only applied when a real text response exists for this
# turn and the user didn't interrupt. Empty/interrupted turns
# already have other surface text that shouldn't be augmented.
if final_response and not interrupted:
try:
_failed = getattr(self, "_turn_failed_file_mutations", None) or {}
if _failed and self._file_mutation_verifier_enabled():
footer = self._format_file_mutation_failure_footer(_failed)
if footer:
final_response = final_response.rstrip() + "\n\n" + footer
except Exception as _ver_err:
logger.debug("file-mutation verifier footer failed: %s", _ver_err)
# Plugin hook: transform_llm_output
# Fired once per turn after the tool-calling loop completes.
# Plugins can transform the LLM's output text before it's returned.
# First hook to return a string wins; None/empty return leaves text unchanged.
if final_response and not interrupted:
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_transform_results = _invoke_hook(
"transform_llm_output",
response_text=final_response,
session_id=self.session_id or "",
model=self.model,
platform=getattr(self, "platform", None) or "",
)
for _hook_result in _transform_results:
if isinstance(_hook_result, str) and _hook_result:
final_response = _hook_result
break # First non-empty string wins
except Exception as exc:
logger.warning("transform_llm_output hook failed: %s", exc)
# Plugin hook: post_llm_call
# Fired once per turn after the tool-calling loop completes.
# Plugins can use this to persist conversation data (e.g. sync
# to an external memory system).
if final_response and not interrupted:
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_invoke_hook(
"post_llm_call",
session_id=self.session_id,
user_message=original_user_message,
assistant_response=final_response,
conversation_history=list(messages),
model=self.model,
platform=getattr(self, "platform", None) or "",
)
except Exception as exc:
logger.warning("post_llm_call hook failed: %s", exc)
# Extract reasoning from the CURRENT turn only. Walk backwards
# but stop at the user message that started this turn — anything
# earlier is from a prior turn and must not leak into the reasoning
# box (confusing stale display; #17055). Within the current turn
# we still want the *most recent* non-empty reasoning: many
# providers (Claude thinking, DeepSeek v4, Codex Responses) emit
# reasoning on the tool-call step and leave the final-answer step
# with reasoning=None, so picking only the last assistant would
# silently drop legitimate same-turn reasoning.
last_reasoning = None
for msg in reversed(messages):
if msg.get("role") == "user":
break # turn boundary — don't cross into prior turns
if msg.get("role") == "assistant" and msg.get("reasoning"):
last_reasoning = msg["reasoning"]
break
# Build result with interrupt info if applicable
result = {
"final_response": final_response,
"last_reasoning": last_reasoning,
"messages": messages,
"api_calls": api_call_count,
"completed": completed,
"turn_exit_reason": _turn_exit_reason,
"partial": False, # True only when stopped due to invalid tool calls
"interrupted": interrupted,
"response_previewed": getattr(self, "_response_was_previewed", False),
"model": self.model,
"provider": self.provider,
"base_url": self.base_url,
"input_tokens": self.session_input_tokens,
"output_tokens": self.session_output_tokens,
"cache_read_tokens": self.session_cache_read_tokens,
"cache_write_tokens": self.session_cache_write_tokens,
"reasoning_tokens": self.session_reasoning_tokens,
"prompt_tokens": self.session_prompt_tokens,
"completion_tokens": self.session_completion_tokens,
"total_tokens": self.session_total_tokens,
"last_prompt_tokens": getattr(self.context_compressor, "last_prompt_tokens", 0) or 0,
"estimated_cost_usd": self.session_estimated_cost_usd,
"cost_status": self.session_cost_status,
"cost_source": self.session_cost_source,
}
if self._tool_guardrail_halt_decision is not None:
result["guardrail"] = self._tool_guardrail_halt_decision.to_metadata()
# If a /steer landed after the final assistant turn (no more tool
# batches to drain into), hand it back to the caller so it can be
# delivered as the next user turn instead of being silently lost.
_leftover_steer = self._drain_pending_steer()
if _leftover_steer:
result["pending_steer"] = _leftover_steer
self._response_was_previewed = False
# Include interrupt message if one triggered the interrupt
if interrupted and self._interrupt_message:
result["interrupt_message"] = self._interrupt_message
# Clear interrupt state after handling
self.clear_interrupt()
# Clear stream callback so it doesn't leak into future calls
self._stream_callback = None
# Check skill trigger NOW — based on how many tool iterations THIS turn used.
_should_review_skills = False
if (self._skill_nudge_interval > 0
and self._iters_since_skill >= self._skill_nudge_interval
and "skill_manage" in self.valid_tool_names):
_should_review_skills = True
self._iters_since_skill = 0
# External memory provider: sync the completed turn + queue next prefetch.
self._sync_external_memory_for_turn(
original_user_message=original_user_message,
final_response=final_response,
interrupted=interrupted,
)
# Background memory/skill review — runs AFTER the response is delivered
# so it never competes with the user's task for model attention.
if final_response and not interrupted and (_should_review_memory or _should_review_skills):
try:
self._spawn_background_review(
messages_snapshot=list(messages),
review_memory=_should_review_memory,
review_skills=_should_review_skills,
)
except Exception:
pass # Background review is best-effort
# Note: Memory provider on_session_end() + shutdown_all() are NOT
# called here — run_conversation() is called once per user message in
# multi-turn sessions. Shutting down after every turn would kill the
# provider before the second message. Actual session-end cleanup is
# handled by the CLI (atexit / /reset) and gateway (session expiry /
# _reset_session).
# Plugin hook: on_session_end
# Fired at the very end of every run_conversation call.
# Plugins can use this for cleanup, flushing buffers, etc.
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_invoke_hook(
"on_session_end",
session_id=self.session_id,
completed=completed,
interrupted=interrupted,
model=self.model,
platform=getattr(self, "platform", None) or "",
)
except Exception as exc:
logger.warning("on_session_end hook failed: %s", exc)
return result
def chat(self, message: str, stream_callback: Optional[callable] = None) -> str:
"""
Simple chat interface that returns just the final response.
Args:
message (str): User message
stream_callback: Optional callback invoked with each text delta during streaming.
Returns:
str: Final assistant response
"""
result = self.run_conversation(message, stream_callback=stream_callback)
return result["final_response"]
def _run_codex_app_server_turn(
self,
*,
user_message: str,
original_user_message: Any,
messages: List[Dict[str, Any]],
effective_task_id: str,
should_review_memory: bool = False,
) -> Dict[str, Any]:
"""Forwarder — see ``agent.codex_runtime.run_codex_app_server_turn``."""
from agent.codex_runtime import run_codex_app_server_turn
return run_codex_app_server_turn(self, user_message=user_message, original_user_message=original_user_message, messages=messages, effective_task_id=effective_task_id, should_review_memory=should_review_memory)
def main(
query: str = None,
model: str = "",
api_key: str = None,
base_url: str = "",
max_turns: int = 10,
enabled_toolsets: str = None,
disabled_toolsets: str = None,
list_tools: bool = False,
save_trajectories: bool = False,
save_sample: bool = False,
verbose: bool = False,
log_prefix_chars: int = 20
):
"""
Main function for running the agent directly.
Args:
query (str): Natural language query for the agent. Defaults to Python 3.13 example.
model (str): Model name to use (OpenRouter format: provider/model). Defaults to anthropic/claude-sonnet-4.6.
api_key (str): API key for authentication. Uses OPENROUTER_API_KEY env var if not provided.
base_url (str): Base URL for the model API. Defaults to https://openrouter.ai/api/v1
max_turns (int): Maximum number of API call iterations. Defaults to 10.
enabled_toolsets (str): Comma-separated list of toolsets to enable. Supports predefined
toolsets (e.g., "research", "development", "safe").
Multiple toolsets can be combined: "web,vision"
disabled_toolsets (str): Comma-separated list of toolsets to disable (e.g., "terminal")
list_tools (bool): Just list available tools and exit
save_trajectories (bool): Save conversation trajectories to JSONL files (appends to trajectory_samples.jsonl). Defaults to False.
save_sample (bool): Save a single trajectory sample to a UUID-named JSONL file for inspection. Defaults to False.
verbose (bool): Enable verbose logging for debugging. Defaults to False.
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.
Toolset Examples:
- "research": Web search, extract, crawl + vision tools
"""
print("🤖 AI Agent with Tool Calling")
print("=" * 50)
# Handle tool listing
if list_tools:
from model_tools import get_all_tool_names, get_available_toolsets
from toolsets import get_all_toolsets, get_toolset_info
print("📋 Available Tools & Toolsets:")
print("-" * 50)
# Show new toolsets system
print("\n🎯 Predefined Toolsets (New System):")
print("-" * 40)
all_toolsets = get_all_toolsets()
# Group by category
basic_toolsets = []
composite_toolsets = []
scenario_toolsets = []
for name, toolset in all_toolsets.items():
info = get_toolset_info(name)
if info:
entry = (name, info)
if name in {"web", "terminal", "vision", "creative", "reasoning"}:
basic_toolsets.append(entry)
elif name in {"research", "development", "analysis", "content_creation", "full_stack"}:
composite_toolsets.append(entry)
else:
scenario_toolsets.append(entry)
# Print basic toolsets
print("\n📌 Basic Toolsets:")
for name, info in basic_toolsets:
tools_str = ', '.join(info['resolved_tools']) if info['resolved_tools'] else 'none'
print(f"{name:15} - {info['description']}")
print(f" Tools: {tools_str}")
# Print composite toolsets
print("\n📂 Composite Toolsets (built from other toolsets):")
for name, info in composite_toolsets:
includes_str = ', '.join(info['includes']) if info['includes'] else 'none'
print(f"{name:15} - {info['description']}")
print(f" Includes: {includes_str}")
print(f" Total tools: {info['tool_count']}")
# Print scenario-specific toolsets
print("\n🎭 Scenario-Specific Toolsets:")
for name, info in scenario_toolsets:
print(f"{name:20} - {info['description']}")
print(f" Total tools: {info['tool_count']}")
# Show legacy toolset compatibility
print("\n📦 Legacy Toolsets (for backward compatibility):")
legacy_toolsets = get_available_toolsets()
for name, info in legacy_toolsets.items():
status = "" if info["available"] else ""
print(f" {status} {name}: {info['description']}")
if not info["available"]:
print(f" Requirements: {', '.join(info['requirements'])}")
# Show individual tools
all_tools = get_all_tool_names()
print(f"\n🔧 Individual Tools ({len(all_tools)} available):")
for tool_name in sorted(all_tools):
toolset = get_toolset_for_tool(tool_name)
print(f" 📌 {tool_name} (from {toolset})")
print("\n💡 Usage Examples:")
print(" # Use predefined toolsets")
print(" python run_agent.py --enabled_toolsets=research --query='search for Python news'")
print(" python run_agent.py --enabled_toolsets=development --query='debug this code'")
print(" python run_agent.py --enabled_toolsets=safe --query='analyze without terminal'")
print(" ")
print(" # Combine multiple toolsets")
print(" python run_agent.py --enabled_toolsets=web,vision --query='analyze website'")
print(" ")
print(" # Disable toolsets")
print(" python run_agent.py --disabled_toolsets=terminal --query='no command execution'")
print(" ")
print(" # Run with trajectory saving enabled")
print(" python run_agent.py --save_trajectories --query='your question here'")
return
# Parse toolset selection arguments
enabled_toolsets_list = None
disabled_toolsets_list = None
if enabled_toolsets:
enabled_toolsets_list = [t.strip() for t in enabled_toolsets.split(",")]
print(f"🎯 Enabled toolsets: {enabled_toolsets_list}")
if disabled_toolsets:
disabled_toolsets_list = [t.strip() for t in disabled_toolsets.split(",")]
print(f"🚫 Disabled toolsets: {disabled_toolsets_list}")
if save_trajectories:
print("💾 Trajectory saving: ENABLED")
print(" - Successful conversations → trajectory_samples.jsonl")
print(" - Failed conversations → failed_trajectories.jsonl")
# Initialize agent with provided parameters
try:
agent = AIAgent(
base_url=base_url,
model=model,
api_key=api_key,
max_iterations=max_turns,
enabled_toolsets=enabled_toolsets_list,
disabled_toolsets=disabled_toolsets_list,
save_trajectories=save_trajectories,
verbose_logging=verbose,
log_prefix_chars=log_prefix_chars
)
except RuntimeError as e:
print(f"❌ Failed to initialize agent: {e}")
return
# Use provided query or default to Python 3.13 example
if query is None:
user_query = (
"Tell me about the latest developments in Python 3.13 and what new features "
"developers should know about. Please search for current information and try it out."
)
else:
user_query = query
print(f"\n📝 User Query: {user_query}")
print("\n" + "=" * 50)
# Run conversation
result = agent.run_conversation(user_query)
print("\n" + "=" * 50)
print("📋 CONVERSATION SUMMARY")
print("=" * 50)
print(f"✅ Completed: {result['completed']}")
print(f"📞 API Calls: {result['api_calls']}")
print(f"💬 Messages: {len(result['messages'])}")
if result['final_response']:
print("\n🎯 FINAL RESPONSE:")
print("-" * 30)
print(result['final_response'])
# Save sample trajectory to UUID-named file if requested
if save_sample:
sample_id = str(uuid.uuid4())[:8]
sample_filename = f"sample_{sample_id}.json"
# Convert messages to trajectory format (same as batch_runner)
trajectory = agent._convert_to_trajectory_format(
result['messages'],
user_query,
result['completed']
)
entry = {
"conversations": trajectory,
"timestamp": datetime.now().isoformat(),
"model": model,
"completed": result['completed'],
"query": user_query
}
try:
with open(sample_filename, "w", encoding="utf-8") as f:
# Pretty-print JSON with indent for readability
f.write(json.dumps(entry, ensure_ascii=False, indent=2))
print(f"\n💾 Sample trajectory saved to: {sample_filename}")
except Exception as e:
print(f"\n⚠️ Failed to save sample: {e}")
print("\n👋 Agent execution completed!")
if __name__ == "__main__":
import fire
fire.Fire(main)