Merge remote-tracking branch 'origin/main' into sid/types-and-lints

# Conflicts:
#	gateway/run.py
#	tools/delegate_tool.py
alt-glitch 2026-04-22 14:27:37 +05:30
commit 847ffca715
171 changed files with 15125 additions and 1675 deletions


@@ -1178,7 +1178,7 @@ class AIAgent:
client_kwargs["default_headers"] = copilot_default_headers()
elif base_url_host_matches(effective_base, "api.kimi.com"):
client_kwargs["default_headers"] = {
"User-Agent": "KimiCLI/1.30.0",
"User-Agent": "claude-code/0.1.0",
}
elif base_url_host_matches(effective_base, "portal.qwen.ai"):
client_kwargs["default_headers"] = _qwen_portal_headers()
@@ -2008,6 +2008,22 @@ class AIAgent:
self._fallback_activated = False
self._fallback_index = 0
# When the user deliberately swaps primary providers (e.g. openrouter
# → anthropic), drop any fallback entries that target the OLD primary
# or the NEW one. The chain was seeded from config at agent init for
# the original provider — without pruning, a failed turn on the new
# primary silently re-activates the provider the user just rejected,
# which is exactly what was reported during TUI v2 blitz testing
# ("switched to anthropic, tui keeps trying openrouter").
old_norm = (old_provider or "").strip().lower()
new_norm = (new_provider or "").strip().lower()
if old_norm and new_norm and old_norm != new_norm:
self._fallback_chain = [
entry for entry in self._fallback_chain
if (entry.get("provider") or "").strip().lower() not in {old_norm, new_norm}
]
self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None
logging.info(
"Model switched in-place: %s (%s) -> %s (%s)",
old_model, old_provider, new_model, new_provider,
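A minimal standalone sketch of the pruning semantics described in the comment above; the chain entries and provider names are illustrative, not taken from any real config:

fallback_chain = [
    {"provider": "openrouter", "model": "fallback-model-a"},
    {"provider": "anthropic", "model": "fallback-model-b"},
    {"provider": "groq", "model": "fallback-model-c"},
]

def prune_chain(chain, old_provider, new_provider):
    """Drop entries that target either the old or the new primary provider."""
    drop = {(old_provider or "").strip().lower(), (new_provider or "").strip().lower()}
    return [e for e in chain if (e.get("provider") or "").strip().lower() not in drop]

# Switching openrouter -> anthropic leaves only the groq entry, so a failed
# turn on the new primary can no longer fall back to the provider just rejected.
assert prune_chain(fallback_chain, "openrouter", "anthropic") == [
    {"provider": "groq", "model": "fallback-model-c"},
]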
@@ -4295,10 +4311,6 @@ class AIAgent:
if self._memory_store:
self._memory_store.load_from_disk()
def _responses_tools(self, tools: Optional[List[Dict[str, Any]]] = None) -> Optional[List[Dict[str, Any]]]:
"""Convert chat-completions tool schemas to Responses function-tool schemas."""
return _codex_responses_tools(tools if tools is not None else self.tools)
@staticmethod
def _deterministic_call_id(fn_name: str, arguments: str, index: int = 0) -> str:
"""Generate a deterministic call_id from tool call content.
@@ -4322,33 +4334,6 @@ class AIAgent:
"""Build a valid Responses `function_call.id` (must start with `fc_`)."""
return _codex_derive_responses_function_call_id(call_id, response_item_id)
def _chat_messages_to_responses_input(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert internal chat-style messages to Responses input items."""
return _codex_chat_messages_to_responses_input(messages)
def _preflight_codex_input_items(self, raw_items: Any) -> List[Dict[str, Any]]:
return _codex_preflight_codex_input_items(raw_items)
def _preflight_codex_api_kwargs(
self,
api_kwargs: Any,
*,
allow_stream: bool = False,
) -> Dict[str, Any]:
return _codex_preflight_codex_api_kwargs(api_kwargs, allow_stream=allow_stream)
def _extract_responses_message_text(self, item: Any) -> str:
"""Extract assistant text from a Responses message output item."""
return _codex_extract_responses_message_text(item)
def _extract_responses_reasoning_text(self, item: Any) -> str:
"""Extract a compact reasoning text from a Responses reasoning item."""
return _codex_extract_responses_reasoning_text(item)
def _normalize_codex_response(self, response: Any) -> tuple[Any, str]:
"""Normalize a Responses API object to an assistant_message-like object."""
return _codex_normalize_codex_response(response)
def _thread_identity(self) -> str:
thread = threading.current_thread()
return f"{thread.name}:{thread.ident}"
@@ -4841,7 +4826,7 @@ class AIAgent:
active_client = client or self._ensure_primary_openai_client(reason="codex_create_stream_fallback")
fallback_kwargs = dict(api_kwargs)
fallback_kwargs["stream"] = True
fallback_kwargs = self._preflight_codex_api_kwargs(fallback_kwargs, allow_stream=True)
fallback_kwargs = self._get_codex_transport().preflight_kwargs(fallback_kwargs, allow_stream=True)
stream_or_response = active_client.responses.create(**fallback_kwargs)
# Compatibility shim for mocks or providers that still return a concrete response.
@@ -5036,7 +5021,7 @@ class AIAgent:
self._client_kwargs["default_headers"] = copilot_default_headers()
elif base_url_host_matches(base_url, "api.kimi.com"):
self._client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.30.0"}
self._client_kwargs["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
elif base_url_host_matches(base_url, "portal.qwen.ai"):
self._client_kwargs["default_headers"] = _qwen_portal_headers()
elif base_url_host_matches(base_url, "chatgpt.com"):
@@ -6583,6 +6568,33 @@ class AIAgent:
self._anthropic_transport = t
return t
def _get_codex_transport(self):
"""Return the cached ResponsesApiTransport instance (lazy singleton)."""
t = getattr(self, "_codex_transport", None)
if t is None:
from agent.transports import get_transport
t = get_transport("codex_responses")
self._codex_transport = t
return t
def _get_chat_completions_transport(self):
"""Return the cached ChatCompletionsTransport instance (lazy singleton)."""
t = getattr(self, "_chat_completions_transport", None)
if t is None:
from agent.transports import get_transport
t = get_transport("chat_completions")
self._chat_completions_transport = t
return t
def _get_bedrock_transport(self):
"""Return the cached BedrockTransport instance (lazy singleton)."""
t = getattr(self, "_bedrock_transport", None)
if t is None:
from agent.transports import get_transport
t = get_transport("bedrock_converse")
self._bedrock_transport = t
return t
def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
if not any(
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
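The three getters above share one lazy-singleton pattern over a string-keyed transport registry. The internals of agent.transports are not visible in this diff, so the factory below is an assumption, sketched only to show the caching behaviour:

from functools import lru_cache

class ResponsesApiTransport: ...
class ChatCompletionsTransport: ...
class BedrockTransport: ...

@lru_cache(maxsize=None)
def get_transport(name: str):
    # Hypothetical stand-in for agent.transports.get_transport: one instance
    # per transport name, created lazily on first request and reused afterwards.
    registry = {
        "codex_responses": ResponsesApiTransport,
        "chat_completions": ChatCompletionsTransport,
        "bedrock_converse": BedrockTransport,
    }
    return registry[name]()

assert get_transport("codex_responses") is get_transport("codex_responses")

In the agent itself the instance is additionally memoised on self (_codex_transport and friends), so repeated calls within a turn avoid even the registry lookup.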
@@ -6722,31 +6734,20 @@ class AIAgent:
# AWS Bedrock native Converse API — bypasses the OpenAI client entirely.
# The adapter handles message/tool conversion and boto3 calls directly.
if self.api_mode == "bedrock_converse":
from agent.bedrock_adapter import build_converse_kwargs
_bt = self._get_bedrock_transport()
region = getattr(self, "_bedrock_region", None) or "us-east-1"
guardrail = getattr(self, "_bedrock_guardrail_config", None)
return {
"__bedrock_converse__": True,
"__bedrock_region__": region,
**build_converse_kwargs(
model=self.model,
messages=api_messages,
tools=self.tools,
max_tokens=self.max_tokens or 4096,
temperature=None, # Let the model use its default
guardrail_config=guardrail,
),
}
return _bt.build_kwargs(
model=self.model,
messages=api_messages,
tools=self.tools,
max_tokens=self.max_tokens or 4096,
region=region,
guardrail_config=guardrail,
)
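For orientation, the request a Converse-based transport ultimately issues looks roughly like the boto3 call below; the model id, prompt, and token budget are placeholders, and the exact kwargs assembled by build_kwargs are not shown in this diff:

import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")
response = client.converse(
    modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",  # placeholder model id
    messages=[{"role": "user", "content": [{"text": "Summarize this repo's README."}]}],
    inferenceConfig={"maxTokens": 4096},
)
print(response["output"]["message"]["content"][0]["text"])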
if self.api_mode == "codex_responses":
instructions = ""
payload_messages = api_messages
if api_messages and api_messages[0].get("role") == "system":
instructions = str(api_messages[0].get("content") or "").strip()
payload_messages = api_messages[1:]
if not instructions:
instructions = DEFAULT_AGENT_IDENTITY
_ct = self._get_codex_transport()
is_github_responses = (
base_url_host_matches(self.base_url, "models.github.ai")
or base_url_host_matches(self.base_url, "api.githubcopilot.com")
@@ -6758,320 +6759,118 @@
and "/backend-api/codex" in self._base_url_lower
)
)
# Resolve reasoning effort: config > default (medium)
reasoning_effort = "medium"
reasoning_enabled = True
if self.reasoning_config and isinstance(self.reasoning_config, dict):
if self.reasoning_config.get("enabled") is False:
reasoning_enabled = False
elif self.reasoning_config.get("effort"):
reasoning_effort = self.reasoning_config["effort"]
# Clamp effort levels not supported by the Responses API model.
# GPT-5.4 supports none/low/medium/high/xhigh but not "minimal".
# "minimal" is valid on OpenRouter and GPT-5 but fails on 5.2/5.4.
_effort_clamp = {"minimal": "low"}
reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)
kwargs = {
"model": self.model,
"instructions": instructions,
"input": self._chat_messages_to_responses_input(payload_messages),
"tools": self._responses_tools(),
"tool_choice": "auto",
"parallel_tool_calls": True,
"store": False,
}
if not is_github_responses:
kwargs["prompt_cache_key"] = self.session_id
is_xai_responses = self.provider == "xai" or self._base_url_hostname == "api.x.ai"
if reasoning_enabled and is_xai_responses:
# xAI reasons automatically — no effort param, just include encrypted content
kwargs["include"] = ["reasoning.encrypted_content"]
elif reasoning_enabled:
if is_github_responses:
# Copilot's Responses route advertises reasoning-effort support,
# but not OpenAI-specific prompt cache or encrypted reasoning
# fields. Keep the payload to the documented subset.
github_reasoning = self._github_models_reasoning_extra_body()
if github_reasoning is not None:
kwargs["reasoning"] = github_reasoning
else:
kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"}
kwargs["include"] = ["reasoning.encrypted_content"]
elif not is_github_responses and not is_xai_responses:
kwargs["include"] = []
if self.request_overrides:
kwargs.update(self.request_overrides)
if self.max_tokens is not None and not is_codex_backend:
kwargs["max_output_tokens"] = self.max_tokens
if is_xai_responses and getattr(self, "session_id", None):
kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id}
return kwargs
sanitized_messages = api_messages
needs_sanitization = False
for msg in api_messages:
if not isinstance(msg, dict):
continue
if "codex_reasoning_items" in msg:
needs_sanitization = True
break
tool_calls = msg.get("tool_calls")
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if not isinstance(tool_call, dict):
continue
if "call_id" in tool_call or "response_item_id" in tool_call:
needs_sanitization = True
break
if needs_sanitization:
break
if needs_sanitization:
sanitized_messages = copy.deepcopy(api_messages)
for msg in sanitized_messages:
if not isinstance(msg, dict):
continue
# Codex-only replay state must not leak into strict chat-completions APIs.
msg.pop("codex_reasoning_items", None)
tool_calls = msg.get("tool_calls")
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if isinstance(tool_call, dict):
tool_call.pop("call_id", None)
tool_call.pop("response_item_id", None)
# Qwen portal: normalize content to list-of-dicts, inject cache_control.
# Must run AFTER codex sanitization so we transform the final messages.
# If sanitization already deepcopied, reuse that copy (in-place).
if self._is_qwen_portal():
if sanitized_messages is api_messages:
# No sanitization was done — we need our own copy.
sanitized_messages = self._qwen_prepare_chat_messages(sanitized_messages)
else:
# Already a deepcopy — transform in place to avoid a second deepcopy.
self._qwen_prepare_chat_messages_inplace(sanitized_messages)
# GPT-5 and Codex models respond better to 'developer' than 'system'
# for instruction-following. Swap the role at the API boundary so
# internal message representation stays uniform ("system").
_model_lower = (self.model or "").lower()
if (
sanitized_messages
and sanitized_messages[0].get("role") == "system"
and any(p in _model_lower for p in DEVELOPER_ROLE_MODELS)
):
# Shallow-copy the list + first message only — rest stays shared.
sanitized_messages = list(sanitized_messages)
sanitized_messages[0] = {**sanitized_messages[0], "role": "developer"}
provider_preferences = {}
if self.providers_allowed:
provider_preferences["only"] = self.providers_allowed
if self.providers_ignored:
provider_preferences["ignore"] = self.providers_ignored
if self.providers_order:
provider_preferences["order"] = self.providers_order
if self.provider_sort:
provider_preferences["sort"] = self.provider_sort
if self.provider_require_parameters:
provider_preferences["require_parameters"] = True
if self.provider_data_collection:
provider_preferences["data_collection"] = self.provider_data_collection
api_kwargs = {
"model": self.model,
"messages": sanitized_messages,
"timeout": self._resolved_api_call_timeout(),
}
try:
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
except Exception:
_fixed_temperature_for_model = None
OMIT_TEMPERATURE = None
if _fixed_temperature_for_model is not None:
fixed_temperature = _fixed_temperature_for_model(self.model, self.base_url)
if fixed_temperature is OMIT_TEMPERATURE:
api_kwargs.pop("temperature", None)
elif fixed_temperature is not None:
api_kwargs["temperature"] = fixed_temperature
if self._is_qwen_portal():
api_kwargs["metadata"] = {
"sessionId": self.session_id or "hermes",
"promptId": str(uuid.uuid4()),
}
if self.tools:
api_kwargs["tools"] = self.tools
# ── max_tokens for chat_completions ──────────────────────────────
# Priority: ephemeral override (error recovery / length-continuation
# boost) > user-configured max_tokens > provider-specific defaults.
_ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None)
if _ephemeral_out is not None:
self._ephemeral_max_output_tokens = None # consume immediately
api_kwargs.update(self._max_tokens_param(_ephemeral_out))
elif self.max_tokens is not None:
api_kwargs.update(self._max_tokens_param(self.max_tokens))
elif "integrate.api.nvidia.com" in self._base_url_lower:
# NVIDIA NIM defaults to a very low max_tokens when omitted,
# causing models like GLM-4.7 to truncate immediately (thinking
# tokens alone exhaust the budget). 16384 provides adequate room.
api_kwargs.update(self._max_tokens_param(16384))
elif self._is_qwen_portal():
# Qwen Portal defaults to a very low max_tokens when omitted.
# Reasoning models (qwen3-coder-plus) exhaust that budget on
# thinking tokens alone, causing the portal to return
# finish_reason="stop" with truncated output — the agent sees
# this as an intentional stop and exits the loop. Send 65536
# (the documented max output for qwen3-coder models) so the
# model has adequate output budget for tool calls.
api_kwargs.update(self._max_tokens_param(65536))
elif (
base_url_host_matches(self.base_url, "api.kimi.com")
or base_url_host_matches(self.base_url, "moonshot.ai")
or base_url_host_matches(self.base_url, "moonshot.cn")
):
# Kimi/Moonshot defaults to a low max_tokens when omitted.
# Reasoning tokens share the output budget — without an explicit
# value the model can exhaust it on thinking alone, causing
# "Response truncated due to output length limit". 32000 matches
# Kimi CLI's default (see MoonshotAI/kimi-cli kimi.py generate()).
api_kwargs.update(self._max_tokens_param(32000))
# Kimi requires reasoning_effort as a top-level chat completions
# parameter (not inside extra_body). Mirror Kimi CLI's
# with_generation_kwargs(reasoning_effort=...) / with_thinking():
# when thinking is disabled, Kimi CLI omits reasoning_effort
# entirely (maps to None).
_kimi_thinking_off = bool(
self.reasoning_config
and isinstance(self.reasoning_config, dict)
and self.reasoning_config.get("enabled") is False
return _ct.build_kwargs(
model=self.model,
messages=api_messages,
tools=self.tools,
reasoning_config=self.reasoning_config,
session_id=getattr(self, "session_id", None),
max_tokens=self.max_tokens,
request_overrides=self.request_overrides,
is_github_responses=is_github_responses,
is_codex_backend=is_codex_backend,
is_xai_responses=is_xai_responses,
github_reasoning_extra=self._github_models_reasoning_extra_body() if is_github_responses else None,
)
if not _kimi_thinking_off:
_kimi_effort = "medium"
if self.reasoning_config and isinstance(self.reasoning_config, dict):
_e = (self.reasoning_config.get("effort") or "").strip().lower()
if _e in ("low", "medium", "high"):
_kimi_effort = _e
api_kwargs["reasoning_effort"] = _kimi_effort
elif (self._is_openrouter_url() or "nousresearch" in self._base_url_lower) and "claude" in (self.model or "").lower():
# OpenRouter and Nous Portal translate requests to Anthropic's
# Messages API, which requires max_tokens as a mandatory field.
# When we omit it, the proxy picks a default that can be too
# low — the model spends its output budget on thinking and has
# almost nothing left for the actual response (especially large
# tool calls like write_file). Sending the model's real output
# limit ensures full capacity.
try:
from agent.anthropic_adapter import _get_anthropic_max_output
_model_output_limit = _get_anthropic_max_output(self.model)
api_kwargs["max_tokens"] = _model_output_limit
except Exception:
pass # fail open — let the proxy pick its default
extra_body = {}
# ── chat_completions (default) ─────────────────────────────────────
_ct = self._get_chat_completions_transport()
_is_openrouter = self._is_openrouter_url()
_is_github_models = (
# Provider detection flags
_is_qwen = self._is_qwen_portal()
_is_or = self._is_openrouter_url()
_is_gh = (
base_url_host_matches(self._base_url_lower, "models.github.ai")
or base_url_host_matches(self._base_url_lower, "api.githubcopilot.com")
)
# Provider preferences (only, ignore, order, sort) are OpenRouter-
# specific. Only send to OpenRouter-compatible endpoints.
# TODO: Nous Portal will add transparent proxy support — re-enable
# for _is_nous when their backend is updated.
if provider_preferences and _is_openrouter:
extra_body["provider"] = provider_preferences
_is_nous = "nousresearch" in self._base_url_lower
# Kimi/Moonshot API uses extra_body.thinking (separate from the
# top-level reasoning_effort) to enable/disable reasoning mode.
# Mirror Kimi CLI's with_thinking() behavior exactly — see
# MoonshotAI/kimi-cli packages/kosong/src/kosong/chat_provider/kimi.py
_is_nvidia = "integrate.api.nvidia.com" in self._base_url_lower
_is_kimi = (
base_url_host_matches(self.base_url, "api.kimi.com")
or base_url_host_matches(self.base_url, "moonshot.ai")
or base_url_host_matches(self.base_url, "moonshot.cn")
)
if _is_kimi:
_kimi_thinking_enabled = True
if self.reasoning_config and isinstance(self.reasoning_config, dict):
if self.reasoning_config.get("enabled") is False:
_kimi_thinking_enabled = False
extra_body["thinking"] = {
"type": "enabled" if _kimi_thinking_enabled else "disabled",
# Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE
# sentinel (temperature omitted entirely), a numeric override, or None.
try:
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
_ft = _fixed_temperature_for_model(self.model, self.base_url)
_omit_temp = _ft is OMIT_TEMPERATURE
_fixed_temp = _ft if not _omit_temp else None
except Exception:
_omit_temp = False
_fixed_temp = None
# Provider preferences (OpenRouter-specific)
_prefs: Dict[str, Any] = {}
if self.providers_allowed:
_prefs["only"] = self.providers_allowed
if self.providers_ignored:
_prefs["ignore"] = self.providers_ignored
if self.providers_order:
_prefs["order"] = self.providers_order
if self.provider_sort:
_prefs["sort"] = self.provider_sort
if self.provider_require_parameters:
_prefs["require_parameters"] = True
if self.provider_data_collection:
_prefs["data_collection"] = self.provider_data_collection
# Anthropic max output for Claude on OpenRouter/Nous
_ant_max = None
if (_is_or or _is_nous) and "claude" in (self.model or "").lower():
try:
from agent.anthropic_adapter import _get_anthropic_max_output
_ant_max = _get_anthropic_max_output(self.model)
except Exception:
pass # fail open — let the proxy pick its default
# Qwen session metadata precomputed here (promptId is per-call random)
_qwen_meta = None
if _is_qwen:
_qwen_meta = {
"sessionId": self.session_id or "hermes",
"promptId": str(uuid.uuid4()),
}
if self._supports_reasoning_extra_body():
if _is_github_models:
github_reasoning = self._github_models_reasoning_extra_body()
if github_reasoning is not None:
extra_body["reasoning"] = github_reasoning
else:
if self.reasoning_config is not None:
rc = dict(self.reasoning_config)
# Nous Portal requires reasoning enabled — don't send
# enabled=false to it (would cause 400).
if _is_nous and rc.get("enabled") is False:
pass # omit reasoning entirely for Nous when disabled
else:
extra_body["reasoning"] = rc
else:
extra_body["reasoning"] = {
"enabled": True,
"effort": "medium"
}
# Ephemeral max output override — consume immediately so the next
# turn doesn't inherit it.
_ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None)
if _ephemeral_out is not None:
self._ephemeral_max_output_tokens = None
# Nous Portal product attribution
if _is_nous:
extra_body["tags"] = ["product=hermes-agent"]
# Ollama num_ctx: override the 2048 default so the model actually
# uses the context window it was trained for. Passed via the OpenAI
# SDK's extra_body → options.num_ctx, which Ollama's OpenAI-compat
# endpoint forwards to the runner as --ctx-size.
if self._ollama_num_ctx:
options = extra_body.get("options", {})
options["num_ctx"] = self._ollama_num_ctx
extra_body["options"] = options
# Ollama / custom provider: pass think=false when reasoning is disabled.
# Ollama does not recognise the OpenRouter-style `reasoning` extra_body
# field, so we use its native `think` parameter instead.
# This prevents thinking-capable models (Qwen3, etc.) from generating
# <think> blocks and producing empty-response errors when the user has
# set reasoning_effort: none.
if self.provider == "custom" and self.reasoning_config and isinstance(self.reasoning_config, dict):
_effort = (self.reasoning_config.get("effort") or "").strip().lower()
_enabled = self.reasoning_config.get("enabled", True)
if _effort == "none" or _enabled is False:
extra_body["think"] = False
if self._is_qwen_portal():
extra_body["vl_high_resolution_images"] = True
if extra_body:
api_kwargs["extra_body"] = extra_body
# Priority Processing / generic request overrides (e.g. service_tier).
# Applied last so overrides win over any defaults set above.
if self.request_overrides:
api_kwargs.update(self.request_overrides)
return api_kwargs
return _ct.build_kwargs(
model=self.model,
messages=api_messages,
tools=self.tools,
timeout=self._resolved_api_call_timeout(),
max_tokens=self.max_tokens,
ephemeral_max_output_tokens=_ephemeral_out,
max_tokens_param_fn=self._max_tokens_param,
reasoning_config=self.reasoning_config,
request_overrides=self.request_overrides,
session_id=getattr(self, "session_id", None),
model_lower=(self.model or "").lower(),
is_openrouter=_is_or,
is_nous=_is_nous,
is_qwen_portal=_is_qwen,
is_github_models=_is_gh,
is_nvidia_nim=_is_nvidia,
is_kimi=_is_kimi,
is_custom_provider=self.provider == "custom",
ollama_num_ctx=self._ollama_num_ctx,
provider_preferences=_prefs or None,
qwen_prepare_fn=self._qwen_prepare_chat_messages if _is_qwen else None,
qwen_prepare_inplace_fn=self._qwen_prepare_chat_messages_inplace if _is_qwen else None,
qwen_session_metadata=_qwen_meta,
fixed_temperature=_fixed_temp,
omit_temperature=_omit_temp,
supports_reasoning=self._supports_reasoning_extra_body(),
github_reasoning_extra=self._github_models_reasoning_extra_body() if _is_gh else None,
anthropic_max_output=_ant_max,
)
def _supports_reasoning_extra_body(self) -> bool:
"""Return True when reasoning extra_body is safe to send for this route/model.
@@ -7428,7 +7227,7 @@ class AIAgent:
if not _aux_available and self.api_mode == "codex_responses":
# No auxiliary client -- use the Codex Responses path directly
codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
codex_kwargs["tools"] = self._get_codex_transport().convert_tools([memory_tool_def])
if _flush_temperature is not None:
codex_kwargs["temperature"] = _flush_temperature
else:
@@ -7463,9 +7262,15 @@ class AIAgent:
# Extract tool calls from the response, handling all API formats
tool_calls = []
if self.api_mode == "codex_responses" and not _aux_available:
assistant_msg, _ = self._normalize_codex_response(response)
if assistant_msg and assistant_msg.tool_calls:
tool_calls = assistant_msg.tool_calls
_ct_flush = self._get_codex_transport()
_cnr_flush = _ct_flush.normalize_response(response)
if _cnr_flush and _cnr_flush.tool_calls:
tool_calls = [
SimpleNamespace(
id=tc.id, type="function",
function=SimpleNamespace(name=tc.name, arguments=tc.arguments),
) for tc in _cnr_flush.tool_calls
]
elif self.api_mode == "anthropic_messages" and not _aux_available:
_tfn = self._get_anthropic_transport()
_flush_nr = _tfn.normalize_response(response, strip_tool_prefix=self._is_anthropic_oauth)
@@ -7635,8 +7440,27 @@ class AIAgent:
finally:
self._executing_tools = False
def _dispatch_delegate_task(self, function_args: dict) -> str:
"""Single call site for delegate_task dispatch.
New DELEGATE_TASK_SCHEMA fields only need to be added here to reach all
invocation paths (concurrent, sequential, inline).
"""
from tools.delegate_tool import delegate_task as _delegate_task
return _delegate_task(
goal=function_args.get("goal"),
context=function_args.get("context"),
toolsets=function_args.get("toolsets"),
tasks=function_args.get("tasks"),
max_iterations=function_args.get("max_iterations"),
acp_command=function_args.get("acp_command"),
acp_args=function_args.get("acp_args"),
role=function_args.get("role"),
parent_agent=self,
)
def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str,
tool_call_id: Optional[str] = None) -> str:
tool_call_id: Optional[str] = None, messages: list = None) -> str:
"""Invoke a single tool and return the result string. No display logic.
Handles both agent-level tools (todo, memory, etc.) and registry-dispatched
@@ -7704,15 +7528,7 @@ class AIAgent:
callback=self.clarify_callback,
)
elif function_name == "delegate_task":
from tools.delegate_tool import delegate_task as _delegate_task
return _delegate_task(
goal=function_args.get("goal"),
context=function_args.get("context"),
toolsets=function_args.get("toolsets"),
tasks=function_args.get("tasks"),
max_iterations=function_args.get("max_iterations"),
parent_agent=self,
)
return self._dispatch_delegate_task(function_args)
else:
return handle_function_call(
function_name, function_args, effective_task_id,
@@ -7874,7 +7690,7 @@ class AIAgent:
pass
start = time.time()
try:
result = self._invoke_tool(function_name, function_args, effective_task_id, tool_call.id)
result = self._invoke_tool(function_name, function_args, effective_task_id, tool_call.id, messages=messages)
except Exception as tool_error:
result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
@@ -8226,7 +8042,6 @@ class AIAgent:
if self._should_emit_quiet_tool_messages():
self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
elif function_name == "delegate_task":
from tools.delegate_tool import delegate_task as _delegate_task
tasks_arg = function_args.get("tasks")
if tasks_arg and isinstance(tasks_arg, list):
spinner_label = f"🔀 delegating {len(tasks_arg)} tasks"
@@ -8241,14 +8056,7 @@ class AIAgent:
self._delegate_spinner = spinner
_delegate_result = None
try:
function_result = _delegate_task(
goal=function_args.get("goal"),
context=function_args.get("context"),
toolsets=function_args.get("toolsets"),
tasks=tasks_arg,
max_iterations=function_args.get("max_iterations"),
parent_agent=self,
)
function_result = self._dispatch_delegate_task(function_args)
_delegate_result = function_result
finally:
self._delegate_spinner = None
@@ -8506,8 +8314,9 @@ class AIAgent:
codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs.pop("tools", None)
summary_response = self._run_codex_stream(codex_kwargs)
assistant_message, _ = self._normalize_codex_response(summary_response)
final_response = (assistant_message.content or "").strip() if assistant_message else ""
_ct_sum = self._get_codex_transport()
_cnr_sum = _ct_sum.normalize_response(summary_response)
final_response = (_cnr_sum.content or "").strip()
else:
summary_kwargs = {
"model": self.model,
@@ -8564,8 +8373,9 @@ class AIAgent:
codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs.pop("tools", None)
retry_response = self._run_codex_stream(codex_kwargs)
retry_msg, _ = self._normalize_codex_response(retry_response)
final_response = (retry_msg.content or "").strip() if retry_msg else ""
_ct_retry = self._get_codex_transport()
_cnr_retry = _ct_retry.normalize_response(retry_response)
final_response = (_cnr_retry.content or "").strip()
elif self.api_mode == "anthropic_messages":
_tretry = self._get_anthropic_transport()
_ant_kw2 = _tretry.build_kwargs(model=self.model, messages=api_messages, tools=None,
@@ -8676,6 +8486,11 @@ class AIAgent:
self._persist_user_message_override = persist_user_message
# Generate unique task_id if not provided to isolate VMs between concurrent tasks
effective_task_id = task_id or str(uuid.uuid4())
# Expose the active task_id so tools running mid-turn (e.g. delegate_task
# in delegate_tool.py) can identify this agent for the cross-agent file
# state registry. Set BEFORE any tool dispatch so snapshots taken at
# child-launch time see the parent's real id, not None.
self._current_task_id = effective_task_id
# Reset retry counters and iteration budget at the start of each turn
# so subagent usage from a previous turn doesn't eat into the next one.
@@ -9322,7 +9137,7 @@ class AIAgent:
if self._force_ascii_payload:
_sanitize_structure_non_ascii(api_kwargs)
if self.api_mode == "codex_responses":
api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
api_kwargs = self._get_codex_transport().preflight_kwargs(api_kwargs, allow_stream=False)
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
@@ -9410,38 +9225,34 @@ class AIAgent:
response_invalid = False
error_details = []
if self.api_mode == "codex_responses":
output_items = getattr(response, "output", None) if response is not None else None
if response is None:
response_invalid = True
error_details.append("response is None")
elif not isinstance(output_items, list):
response_invalid = True
error_details.append("response.output is not a list")
elif not output_items:
# Stream backfill may have failed, but
# _normalize_codex_response can still recover
# from response.output_text. Only mark invalid
# when that fallback is also absent.
_out_text = getattr(response, "output_text", None)
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
if _out_text_stripped:
logger.debug(
"Codex response.output is empty but output_text is present "
"(%d chars); deferring to normalization.",
len(_out_text_stripped),
)
else:
_resp_status = getattr(response, "status", None)
_resp_incomplete = getattr(response, "incomplete_details", None)
logger.warning(
"Codex response.output is empty after stream backfill "
"(status=%s, incomplete_details=%s, model=%s). %s",
_resp_status, _resp_incomplete,
getattr(response, "model", None),
f"api_mode={self.api_mode} provider={self.provider}",
)
_ct_v = self._get_codex_transport()
if not _ct_v.validate_response(response):
if response is None:
response_invalid = True
error_details.append("response.output is empty")
error_details.append("response is None")
else:
# output_text fallback: stream backfill may have failed
# but normalize can still recover from output_text
_out_text = getattr(response, "output_text", None)
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
if _out_text_stripped:
logger.debug(
"Codex response.output is empty but output_text is present "
"(%d chars); deferring to normalization.",
len(_out_text_stripped),
)
else:
_resp_status = getattr(response, "status", None)
_resp_incomplete = getattr(response, "incomplete_details", None)
logger.warning(
"Codex response.output is empty after stream backfill "
"(status=%s, incomplete_details=%s, model=%s). %s",
_resp_status, _resp_incomplete,
getattr(response, "model", None),
f"api_mode={self.api_mode} provider={self.provider}",
)
response_invalid = True
error_details.append("response.output is empty")
elif self.api_mode == "anthropic_messages":
_tv = self._get_anthropic_transport()
if not _tv.validate_response(response):
@@ -9450,8 +9261,17 @@ class AIAgent:
error_details.append("response is None")
else:
error_details.append("response.content invalid (not a non-empty list)")
elif self.api_mode == "bedrock_converse":
_btv = self._get_bedrock_transport()
if not _btv.validate_response(response):
response_invalid = True
if response is None:
error_details.append("response is None")
else:
error_details.append("Bedrock response invalid (no output or choices)")
else:
if response is None or not hasattr(response, 'choices') or response.choices is None or not response.choices:
_ctv = self._get_chat_completions_transport()
if not _ctv.validate_response(response):
response_invalid = True
if response is None:
error_details.append("response is None")
@@ -9612,6 +9432,10 @@ class AIAgent:
elif self.api_mode == "anthropic_messages":
_tfr = self._get_anthropic_transport()
finish_reason = _tfr.map_finish_reason(response.stop_reason)
elif self.api_mode == "bedrock_converse":
# Bedrock response is already normalized at dispatch — finish_reason
# is already in OpenAI format via normalize_converse_response()
finish_reason = response.choices[0].finish_reason if hasattr(response, "choices") and response.choices else "stop"
else:
finish_reason = response.choices[0].finish_reason
assistant_message = response.choices[0].message
@@ -10867,7 +10691,40 @@ class AIAgent:
try:
if self.api_mode == "codex_responses":
assistant_message, finish_reason = self._normalize_codex_response(response)
_ct = self._get_codex_transport()
_cnr = _ct.normalize_response(response)
# Back-compat shim: downstream expects SimpleNamespace with
# codex-specific fields (.codex_reasoning_items, .reasoning_details,
# and .call_id/.response_item_id on tool calls).
_tc_list = None
if _cnr.tool_calls:
_tc_list = []
for tc in _cnr.tool_calls:
_tc_ns = SimpleNamespace(
id=tc.id, type="function",
function=SimpleNamespace(name=tc.name, arguments=tc.arguments),
)
if tc.provider_data:
if tc.provider_data.get("call_id"):
_tc_ns.call_id = tc.provider_data["call_id"]
if tc.provider_data.get("response_item_id"):
_tc_ns.response_item_id = tc.provider_data["response_item_id"]
_tc_list.append(_tc_ns)
assistant_message = SimpleNamespace(
content=_cnr.content,
tool_calls=_tc_list or None,
reasoning=_cnr.reasoning,
reasoning_content=None,
codex_reasoning_items=(
_cnr.provider_data.get("codex_reasoning_items")
if _cnr.provider_data else None
),
reasoning_details=(
_cnr.provider_data.get("reasoning_details")
if _cnr.provider_data else None
),
)
finish_reason = _cnr.finish_reason
elif self.api_mode == "anthropic_messages":
_transport = self._get_anthropic_transport()
_nr = _transport.normalize_response(