mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor
This commit is contained in:
commit
2aea75e91e
131 changed files with 12350 additions and 1164 deletions
348
run_agent.py
348
run_agent.py
|
|
@ -94,7 +94,7 @@ from agent.model_metadata import (
|
|||
from agent.context_compressor import ContextCompressor
|
||||
from agent.subdirectory_hints import SubdirectoryHintTracker
|
||||
from agent.prompt_caching import apply_anthropic_cache_control
|
||||
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt, load_soul_md, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, DEVELOPER_ROLE_MODELS, GOOGLE_MODEL_OPERATIONAL_GUIDANCE, OPENAI_MODEL_EXECUTION_GUIDANCE
|
||||
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt, build_environment_hints, load_soul_md, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, DEVELOPER_ROLE_MODELS, GOOGLE_MODEL_OPERATIONAL_GUIDANCE, OPENAI_MODEL_EXECUTION_GUIDANCE
|
||||
from agent.usage_pricing import estimate_usage_cost, normalize_usage
|
||||
from agent.display import (
|
||||
KawaiiSpinner, build_tool_preview as _build_tool_preview,
|
||||
|
|
@ -339,10 +339,7 @@ def _paths_overlap(left: Path, right: Path) -> bool:
|
|||
|
||||
# Matches lone UTF-16 surrogate code points (U+D800–U+DFFF). These are
# invalid in well-formed text and break JSON/UTF-8 serialization, so they
# are scrubbed before messages go over the wire.
_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')

# Matches the bracketed budget-pressure notices injected into tool-result
# content, e.g. "[BUDGET WARNING: Iteration 9/10. ...]" or
# "[BUDGET: Iteration 7/10. ...]". re.DOTALL lets the notice span newlines.
_BUDGET_WARNING_RE = re.compile(
    r"\[BUDGET(?:\s+WARNING)?:\s+Iteration\s+\d+/\d+\..*?\]",
    re.DOTALL,
)
|
||||
|
||||
|
||||
|
||||
def _sanitize_surrogates(text: str) -> str:
|
||||
|
|
@ -463,34 +460,7 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
|
|||
return found
|
||||
|
||||
|
||||
def _strip_budget_warnings_from_history(messages: list) -> None:
    """Remove budget pressure warnings from tool-result messages in-place.

    Budget warnings are turn-scoped signals that must not leak into replayed
    history. They live in tool-result ``content`` either as a JSON key
    (``_budget_warning``) or appended plain text.
    """
    for entry in messages:
        # Only tool-result dicts can carry an injected budget warning.
        if not isinstance(entry, dict) or entry.get("role") != "tool":
            continue
        body = entry.get("content")
        if not isinstance(body, str):
            continue
        # Cheap substring screen before attempting any parsing.
        if "_budget_warning" not in body and "[BUDGET" not in body:
            continue

        # Common case: the warning sits under a ``_budget_warning`` key in a
        # JSON-encoded dict payload.
        decoded = None
        try:
            decoded = json.loads(body)
        except (json.JSONDecodeError, TypeError):
            pass
        if isinstance(decoded, dict) and "_budget_warning" in decoded:
            del decoded["_budget_warning"]
            entry["content"] = json.dumps(decoded, ensure_ascii=False)
            continue

        # Fallback: scrub the bracketed text pattern from plain-text results.
        scrubbed = _BUDGET_WARNING_RE.sub("", body).strip()
        if scrubbed != body:
            entry["content"] = scrubbed
|
||||
|
||||
|
||||
# =========================================================================
|
||||
|
|
@ -579,6 +549,7 @@ class AIAgent:
|
|||
clarify_callback: callable = None,
|
||||
step_callback: callable = None,
|
||||
stream_delta_callback: callable = None,
|
||||
interim_assistant_callback: callable = None,
|
||||
tool_gen_callback: callable = None,
|
||||
status_callback: callable = None,
|
||||
max_tokens: int = None,
|
||||
|
|
@ -728,6 +699,7 @@ class AIAgent:
|
|||
self.clarify_callback = clarify_callback
|
||||
self.step_callback = step_callback
|
||||
self.stream_delta_callback = stream_delta_callback
|
||||
self.interim_assistant_callback = interim_assistant_callback
|
||||
self.status_callback = status_callback
|
||||
self.tool_gen_callback = tool_gen_callback
|
||||
|
||||
|
|
@ -775,12 +747,14 @@ class AIAgent:
|
|||
self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
|
||||
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)
|
||||
|
||||
# Iteration budget pressure: warn the LLM as it approaches max_iterations.
|
||||
# Warnings are injected into the last tool result JSON (not as separate
|
||||
# messages) so they don't break message structure or invalidate caching.
|
||||
self._budget_caution_threshold = 0.7 # 70% — nudge to start wrapping up
|
||||
self._budget_warning_threshold = 0.9 # 90% — urgent, respond now
|
||||
self._budget_pressure_enabled = True
|
||||
# Iteration budget: the LLM is only notified when it actually exhausts
|
||||
# the iteration budget (api_call_count >= max_iterations). At that
|
||||
# point we inject ONE message, allow one final API call, and if the
|
||||
# model doesn't produce a text response, force a user-message asking
|
||||
# it to summarise. No intermediate pressure warnings — they caused
|
||||
# models to "give up" prematurely on complex tasks (#7915).
|
||||
self._budget_exhausted_injected = False
|
||||
self._budget_grace_call = False
|
||||
|
||||
# Context pressure warnings: notify the USER (not the LLM) as context
|
||||
# fills up. Purely informational — displayed in CLI output and sent via
|
||||
|
|
@ -831,6 +805,11 @@ class AIAgent:
|
|||
# Deferred paragraph break flag — set after tool iterations so a
|
||||
# single "\n\n" is prepended to the next real text delta.
|
||||
self._stream_needs_break = False
|
||||
# Visible assistant text already delivered through live token callbacks
|
||||
# during the current model response. Used to avoid re-sending the same
|
||||
# commentary when the provider later returns it as a completed interim
|
||||
# assistant message.
|
||||
self._current_streamed_assistant_text = ""
|
||||
|
||||
# Optional current-turn user-message override used when the API-facing
|
||||
# user message intentionally differs from the persisted transcript
|
||||
|
|
@ -1328,9 +1307,23 @@ class AIAgent:
|
|||
api_key=getattr(self, "api_key", ""),
|
||||
config_context_length=_config_context_length,
|
||||
provider=self.provider,
|
||||
api_mode=self.api_mode,
|
||||
)
|
||||
self.compression_enabled = compression_enabled
|
||||
|
||||
# Reject models whose context window is below the minimum required
|
||||
# for reliable tool-calling workflows (64K tokens).
|
||||
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
|
||||
_ctx = getattr(self.context_compressor, "context_length", 0)
|
||||
if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH:
|
||||
raise ValueError(
|
||||
f"Model {self.model} has a context window of {_ctx:,} tokens, "
|
||||
f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
|
||||
f"by Hermes Agent. Choose a model with at least "
|
||||
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
|
||||
f"model.context_length in config.yaml to override."
|
||||
)
|
||||
|
||||
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand)
|
||||
self._context_engine_tool_names: set = set()
|
||||
if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None:
|
||||
|
|
@ -1571,6 +1564,7 @@ class AIAgent:
|
|||
base_url=self.base_url,
|
||||
api_key=getattr(self, "api_key", ""),
|
||||
provider=self.provider,
|
||||
api_mode=self.api_mode,
|
||||
)
|
||||
|
||||
# ── Invalidate cached system prompt so it rebuilds next turn ──
|
||||
|
|
@ -1704,6 +1698,16 @@ class AIAgent:
|
|||
except Exception:
|
||||
logger.debug("status_callback error in _emit_status", exc_info=True)
|
||||
|
||||
def _current_main_runtime(self) -> Dict[str, str]:
    """Return the live main runtime for session-scoped auxiliary routing."""
    # Missing or falsy attributes collapse to "" so callers always get a
    # fully-populated, string-valued mapping.
    runtime: Dict[str, str] = {}
    for field in ("model", "provider", "base_url", "api_key", "api_mode"):
        runtime[field] = getattr(self, field, "") or ""
    return runtime
|
||||
|
||||
def _check_compression_model_feasibility(self) -> None:
|
||||
"""Warn at session start if the auxiliary compression model's context
|
||||
window is smaller than the main model's compression threshold.
|
||||
|
|
@ -1724,7 +1728,10 @@ class AIAgent:
|
|||
from agent.auxiliary_client import get_text_auxiliary_client
|
||||
from agent.model_metadata import get_model_context_length
|
||||
|
||||
client, aux_model = get_text_auxiliary_client("compression")
|
||||
client, aux_model = get_text_auxiliary_client(
|
||||
"compression",
|
||||
main_runtime=self._current_main_runtime(),
|
||||
)
|
||||
if client is None or not aux_model:
|
||||
msg = (
|
||||
"⚠ No auxiliary LLM provider configured — context "
|
||||
|
|
@ -3186,11 +3193,17 @@ class AIAgent:
|
|||
f"not on any model name returned by the API."
|
||||
)
|
||||
|
||||
# Environment hints (WSL, Termux, etc.) — tell the agent about the
|
||||
# execution environment so it can translate paths and adapt behavior.
|
||||
_env_hints = build_environment_hints()
|
||||
if _env_hints:
|
||||
prompt_parts.append(_env_hints)
|
||||
|
||||
platform_key = (self.platform or "").lower().strip()
|
||||
if platform_key in PLATFORM_HINTS:
|
||||
prompt_parts.append(PLATFORM_HINTS[platform_key])
|
||||
|
||||
return "\n\n".join(prompt_parts)
|
||||
return "\n\n".join(p.strip() for p in prompt_parts if p.strip())
|
||||
|
||||
# =========================================================================
|
||||
# Pre/post-call guardrails (inspired by PR #1321 — @alireza78a)
|
||||
|
|
@ -4730,6 +4743,49 @@ class AIAgent:
|
|||
|
||||
# ── Unified streaming API call ─────────────────────────────────────────
|
||||
|
||||
def _reset_stream_delivery_tracking(self) -> None:
    """Clear the accumulator of text delivered during the current model response."""
    # A fresh response starts with no streamed text on record.
    self._current_streamed_assistant_text = ""
|
||||
|
||||
def _record_streamed_assistant_text(self, text: str) -> None:
    """Accumulate visible assistant text emitted through stream callbacks."""
    # Ignore non-strings and empty deltas — only real visible text counts.
    if not isinstance(text, str) or not text:
        return
    prior = getattr(self, "_current_streamed_assistant_text", "")
    self._current_streamed_assistant_text = prior + text
|
||||
|
||||
@staticmethod
def _normalize_interim_visible_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim; '' for non-strings."""
    if isinstance(text, str):
        return re.sub(r"\s+", " ", text).strip()
    return ""
|
||||
|
||||
def _interim_content_was_streamed(self, content: str) -> bool:
    """Return True when *content* matches text already sent via live streaming."""
    visible = self._normalize_interim_visible_text(
        self._strip_think_blocks(content or "")
    )
    if not visible:
        return False
    # Compare against the whitespace-normalized, think-stripped view of the
    # text accumulated from live token callbacks this response.
    prior = getattr(self, "_current_streamed_assistant_text", "") or ""
    streamed = self._normalize_interim_visible_text(self._strip_think_blocks(prior))
    return bool(streamed) and streamed == visible
|
||||
|
||||
def _emit_interim_assistant_message(self, assistant_msg: Dict[str, Any]) -> None:
    """Surface a real mid-turn assistant commentary message to the UI layer."""
    cb = getattr(self, "interim_assistant_callback", None)
    if cb is None or not isinstance(assistant_msg, dict):
        return
    visible = self._strip_think_blocks(assistant_msg.get("content") or "").strip()
    # "(empty)" is a placeholder body, not real commentary — suppress it.
    if not visible or visible == "(empty)":
        return
    already_streamed = self._interim_content_was_streamed(visible)
    # The callback is UI-layer code; never let its failures break the turn.
    try:
        cb(visible, already_streamed=already_streamed)
    except Exception:
        logger.debug("interim_assistant_callback error", exc_info=True)
|
||||
|
||||
def _fire_stream_delta(self, text: str) -> None:
|
||||
"""Fire all registered stream delta callbacks (display + TTS)."""
|
||||
# If a tool iteration set the break flag, prepend a single paragraph
|
||||
|
|
@ -4739,12 +4795,16 @@ class AIAgent:
|
|||
if getattr(self, "_stream_needs_break", False) and text and text.strip():
|
||||
self._stream_needs_break = False
|
||||
text = "\n\n" + text
|
||||
for cb in (self.stream_delta_callback, self._stream_callback):
|
||||
if cb is not None:
|
||||
try:
|
||||
cb(text)
|
||||
except Exception:
|
||||
pass
|
||||
callbacks = [cb for cb in (self.stream_delta_callback, self._stream_callback) if cb is not None]
|
||||
delivered = False
|
||||
for cb in callbacks:
|
||||
try:
|
||||
cb(text)
|
||||
delivered = True
|
||||
except Exception:
|
||||
pass
|
||||
if delivered:
|
||||
self._record_streamed_assistant_text(text)
|
||||
|
||||
def _fire_reasoning_delta(self, text: str) -> None:
|
||||
"""Fire reasoning callback if registered."""
|
||||
|
|
@ -4928,6 +4988,7 @@ class AIAgent:
|
|||
if self.stream_delta_callback:
|
||||
try:
|
||||
self.stream_delta_callback(delta.content)
|
||||
self._record_streamed_assistant_text(delta.content)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -6509,17 +6570,23 @@ class AIAgent:
|
|||
if messages and messages[-1].get("_flush_sentinel") == _sentinel:
|
||||
messages.pop()
|
||||
|
||||
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
|
||||
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default", focus_topic: str = None) -> tuple:
|
||||
"""Compress conversation context and split the session in SQLite.
|
||||
|
||||
Args:
|
||||
focus_topic: Optional focus string for guided compression — the
|
||||
summariser will prioritise preserving information related to
|
||||
this topic. Inspired by Claude Code's ``/compact <focus>``.
|
||||
|
||||
Returns:
|
||||
(compressed_messages, new_system_prompt) tuple
|
||||
"""
|
||||
_pre_msg_count = len(messages)
|
||||
logger.info(
|
||||
"context compression started: session=%s messages=%d tokens=~%s model=%s",
|
||||
"context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r",
|
||||
self.session_id or "none", _pre_msg_count,
|
||||
f"{approx_tokens:,}" if approx_tokens else "unknown", self.model,
|
||||
focus_topic,
|
||||
)
|
||||
# Pre-compression memory flush: let the model save memories before they're lost
|
||||
self.flush_memories(messages, min_turns=0)
|
||||
|
|
@ -6531,7 +6598,7 @@ class AIAgent:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens)
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic)
|
||||
|
||||
todo_snapshot = self._todo_store.format_for_injection()
|
||||
if todo_snapshot:
|
||||
|
|
@ -6920,24 +6987,6 @@ class AIAgent:
|
|||
turn_tool_msgs = messages[-num_tools:]
|
||||
enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
|
||||
|
||||
# ── Budget pressure injection ────────────────────────────────────
|
||||
budget_warning = self._get_budget_warning(api_call_count)
|
||||
if budget_warning and messages and messages[-1].get("role") == "tool":
|
||||
last_content = messages[-1]["content"]
|
||||
try:
|
||||
parsed = json.loads(last_content)
|
||||
if isinstance(parsed, dict):
|
||||
parsed["_budget_warning"] = budget_warning
|
||||
messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
|
||||
else:
|
||||
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
|
||||
if not self.quiet_mode:
|
||||
remaining = self.max_iterations - api_call_count
|
||||
tier = "⚠️ WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
|
||||
print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
|
||||
|
||||
def _execute_tool_calls_sequential(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
|
||||
"""Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
|
||||
for i, tool_call in enumerate(assistant_message.tool_calls, 1):
|
||||
|
|
@ -6986,6 +7035,15 @@ class AIAgent:
|
|||
self._current_tool = function_name
|
||||
self._touch_activity(f"executing tool: {function_name}")
|
||||
|
||||
# Set activity callback for long-running tool execution (terminal
|
||||
# commands, etc.) so the gateway's inactivity monitor doesn't kill
|
||||
# the agent while a command is running.
|
||||
try:
|
||||
from tools.environments.base import set_activity_callback
|
||||
set_activity_callback(self._touch_activity)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if self.tool_progress_callback:
|
||||
try:
|
||||
preview = _build_tool_preview(function_name, function_args)
|
||||
|
|
@ -7275,50 +7333,7 @@ class AIAgent:
|
|||
if num_tools_seq > 0:
|
||||
enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
|
||||
|
||||
# ── Budget pressure injection ─────────────────────────────────
|
||||
# After all tool calls in this turn are processed, check if we're
|
||||
# approaching max_iterations. If so, inject a warning into the LAST
|
||||
# tool result's JSON so the LLM sees it naturally when reading results.
|
||||
budget_warning = self._get_budget_warning(api_call_count)
|
||||
if budget_warning and messages and messages[-1].get("role") == "tool":
|
||||
last_content = messages[-1]["content"]
|
||||
try:
|
||||
parsed = json.loads(last_content)
|
||||
if isinstance(parsed, dict):
|
||||
parsed["_budget_warning"] = budget_warning
|
||||
messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
|
||||
else:
|
||||
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
|
||||
if not self.quiet_mode:
|
||||
remaining = self.max_iterations - api_call_count
|
||||
tier = "⚠️ WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
|
||||
print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
|
||||
|
||||
def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
    """Return a budget pressure string, or None if not yet needed.

    Two-tier system:
    - Caution (70%): nudge to consolidate work
    - Warning (90%): urgent, must respond now
    """
    if not self._budget_pressure_enabled or self.max_iterations <= 0:
        return None
    fraction = api_call_count / self.max_iterations
    remaining = self.max_iterations - api_call_count
    # Tiers are ordered most-urgent first; the first threshold reached wins.
    tiers = (
        (
            self._budget_warning_threshold,
            f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
            f"Only {remaining} iteration(s) left. "
            "Provide your final response NOW. No more tool calls unless absolutely critical.]",
        ),
        (
            self._budget_caution_threshold,
            f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
            f"{remaining} iterations left. Start consolidating your work.]",
        ),
    )
    for threshold, message in tiers:
        if fraction >= threshold:
            return message
    return None
|
||||
|
||||
def _emit_context_pressure(self, compaction_progress: float, compressor) -> None:
|
||||
"""Notify the user that context is approaching the compaction threshold.
|
||||
|
|
@ -7542,6 +7557,11 @@ class AIAgent:
|
|||
# Installed once, transparent when streams are healthy, prevents crash on write.
|
||||
_install_safe_stdio()
|
||||
|
||||
# Tag all log records on this thread with the session ID so
|
||||
# ``hermes logs --session <id>`` can filter a single conversation.
|
||||
from hermes_logging import set_session_context
|
||||
set_session_context(self.session_id)
|
||||
|
||||
# If the previous turn activated fallback, restore the primary
|
||||
# runtime so this turn gets a fresh attempt with the preferred model.
|
||||
# No-op when _fallback_activated is False (gateway, first turn, etc.).
|
||||
|
|
@ -7611,14 +7631,6 @@ class AIAgent:
|
|||
# Initialize conversation (copy to avoid mutating the caller's list)
|
||||
messages = list(conversation_history) if conversation_history else []
|
||||
|
||||
# Strip budget pressure warnings from previous turns. These are
|
||||
# turn-scoped signals injected by _get_budget_warning() into tool
|
||||
# result content. If left in the replayed history, models (especially
|
||||
# GPT-family) interpret them as still-active instructions and avoid
|
||||
# making tool calls in ALL subsequent turns.
|
||||
if messages:
|
||||
_strip_budget_warnings_from_history(messages)
|
||||
|
||||
# Hydrate todo store from conversation history (gateway creates a fresh
|
||||
# AIAgent per message, so the in-memory store is empty -- we need to
|
||||
# recover the todo state from the most recent todo tool response in history)
|
||||
|
|
@ -7835,7 +7847,7 @@ class AIAgent:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
|
||||
while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call:
|
||||
# Reset per-turn checkpoint dedup so each iteration can take one snapshot
|
||||
self._checkpoint_mgr.new_turn()
|
||||
|
||||
|
|
@ -7850,7 +7862,13 @@ class AIAgent:
|
|||
api_call_count += 1
|
||||
self._api_call_count = api_call_count
|
||||
self._touch_activity(f"starting API call #{api_call_count}")
|
||||
if not self.iteration_budget.consume():
|
||||
|
||||
# Grace call: the budget is exhausted but we gave the model one
|
||||
# more chance. Consume the grace flag so the loop exits after
|
||||
# this iteration regardless of outcome.
|
||||
if self._budget_grace_call:
|
||||
self._budget_grace_call = False
|
||||
elif not self.iteration_budget.consume():
|
||||
_turn_exit_reason = "budget_exhausted"
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)")
|
||||
|
|
@ -7977,9 +7995,39 @@ class AIAgent:
|
|||
# manual message manipulation are always caught.
|
||||
api_messages = self._sanitize_api_messages(api_messages)
|
||||
|
||||
# Normalize message whitespace and tool-call JSON for consistent
|
||||
# prefix matching. Ensures bit-perfect prefixes across turns,
|
||||
# which enables KV cache reuse on local inference servers
|
||||
# (llama.cpp, vLLM, Ollama) and improves cache hit rates for
|
||||
# cloud providers. Operates on api_messages (the API copy) so
|
||||
# the original conversation history in `messages` is untouched.
|
||||
for am in api_messages:
|
||||
if isinstance(am.get("content"), str):
|
||||
am["content"] = am["content"].strip()
|
||||
for am in api_messages:
|
||||
tcs = am.get("tool_calls")
|
||||
if not tcs:
|
||||
continue
|
||||
new_tcs = []
|
||||
for tc in tcs:
|
||||
if isinstance(tc, dict) and "function" in tc:
|
||||
try:
|
||||
args_obj = json.loads(tc["function"]["arguments"])
|
||||
tc = {**tc, "function": {
|
||||
**tc["function"],
|
||||
"arguments": json.dumps(
|
||||
args_obj, separators=(",", ":"),
|
||||
sort_keys=True,
|
||||
),
|
||||
}}
|
||||
except Exception:
|
||||
pass
|
||||
new_tcs.append(tc)
|
||||
am["tool_calls"] = new_tcs
|
||||
|
||||
# Calculate approximate request size for logging
|
||||
total_chars = sum(len(str(msg)) for msg in api_messages)
|
||||
approx_tokens = total_chars // 4 # Rough estimate: 4 chars per token
|
||||
approx_tokens = estimate_messages_tokens_rough(api_messages)
|
||||
|
||||
# Thinking spinner for quiet mode (animated during API call)
|
||||
thinking_spinner = None
|
||||
|
|
@ -8028,6 +8076,7 @@ class AIAgent:
|
|||
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
self._reset_stream_delivery_tracking()
|
||||
api_kwargs = self._build_api_kwargs(api_messages)
|
||||
if self.api_mode == "codex_responses":
|
||||
api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
|
||||
|
|
@ -8186,6 +8235,8 @@ class AIAgent:
|
|||
self._emit_status("⚠️ Empty/malformed response — switching to fallback...")
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
continue
|
||||
|
||||
# Check for error field in response (some providers include this)
|
||||
|
|
@ -8221,6 +8272,8 @@ class AIAgent:
|
|||
self._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
continue
|
||||
self._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
|
||||
logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
|
||||
|
|
@ -8875,6 +8928,8 @@ class AIAgent:
|
|||
self._emit_status("⚠️ Rate limited — switching to fallback provider...")
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
continue
|
||||
|
||||
is_payload_too_large = (
|
||||
|
|
@ -9088,6 +9143,8 @@ class AIAgent:
|
|||
self._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
continue
|
||||
if api_kwargs is not None:
|
||||
self._dump_api_request_debug(
|
||||
|
|
@ -9153,6 +9210,8 @@ class AIAgent:
|
|||
self._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
continue
|
||||
_final_summary = self._summarize_api_error(api_error)
|
||||
if is_rate_limited:
|
||||
|
|
@ -9376,8 +9435,6 @@ class AIAgent:
|
|||
# Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
|
||||
# This means the model ran out of output tokens mid-reasoning — retry up to 2 times
|
||||
if has_incomplete_scratchpad(assistant_message.content or ""):
|
||||
if not hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
self._incomplete_scratchpad_retries += 1
|
||||
|
||||
self._vprint(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
|
||||
|
|
@ -9405,12 +9462,9 @@ class AIAgent:
|
|||
}
|
||||
|
||||
# Reset incomplete scratchpad counter on clean response
|
||||
if hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
|
||||
if self.api_mode == "codex_responses" and finish_reason == "incomplete":
|
||||
if not hasattr(self, "_codex_incomplete_retries"):
|
||||
self._codex_incomplete_retries = 0
|
||||
self._codex_incomplete_retries += 1
|
||||
|
||||
interim_msg = self._build_assistant_message(assistant_message, finish_reason)
|
||||
|
|
@ -9437,6 +9491,7 @@ class AIAgent:
|
|||
)
|
||||
if not duplicate_interim:
|
||||
messages.append(interim_msg)
|
||||
self._emit_interim_assistant_message(interim_msg)
|
||||
|
||||
if self._codex_incomplete_retries < 3:
|
||||
if not self.quiet_mode:
|
||||
|
|
@ -9481,8 +9536,6 @@ class AIAgent:
|
|||
]
|
||||
if invalid_tool_calls:
|
||||
# Track retries for invalid tool calls
|
||||
if not hasattr(self, '_invalid_tool_retries'):
|
||||
self._invalid_tool_retries = 0
|
||||
self._invalid_tool_retries += 1
|
||||
|
||||
# Return helpful error to model — model can self-correct next turn
|
||||
|
|
@ -9518,8 +9571,7 @@ class AIAgent:
|
|||
})
|
||||
continue
|
||||
# Reset retry counter on successful tool call validation
|
||||
if hasattr(self, '_invalid_tool_retries'):
|
||||
self._invalid_tool_retries = 0
|
||||
self._invalid_tool_retries = 0
|
||||
|
||||
# Validate tool call arguments are valid JSON
|
||||
# Handle empty strings as empty objects (common model quirk)
|
||||
|
|
@ -9659,6 +9711,7 @@ class AIAgent:
|
|||
messages.pop()
|
||||
|
||||
messages.append(assistant_msg)
|
||||
self._emit_interim_assistant_message(assistant_msg)
|
||||
|
||||
# Close any open streaming display (response box, reasoning
|
||||
# box) before tool execution begins. Intermediate turns may
|
||||
|
|
@ -9921,8 +9974,7 @@ class AIAgent:
|
|||
break
|
||||
|
||||
# Reset retry counter/signature on successful content
|
||||
if hasattr(self, '_empty_content_retries'):
|
||||
self._empty_content_retries = 0
|
||||
self._empty_content_retries = 0
|
||||
self._thinking_prefill_retries = 0
|
||||
|
||||
if (
|
||||
|
|
@ -9938,6 +9990,7 @@ class AIAgent:
|
|||
codex_ack_continuations += 1
|
||||
interim_msg = self._build_assistant_message(assistant_message, "incomplete")
|
||||
messages.append(interim_msg)
|
||||
self._emit_interim_assistant_message(interim_msg)
|
||||
|
||||
continue_msg = {
|
||||
"role": "user",
|
||||
|
|
@ -9988,8 +10041,7 @@ class AIAgent:
|
|||
except (OSError, ValueError):
|
||||
logger.error(error_msg)
|
||||
|
||||
if self.verbose_logging:
|
||||
logging.exception("Detailed error information:")
|
||||
logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True)
|
||||
|
||||
# If an assistant message with tool_calls was already appended,
|
||||
# the API expects a role="tool" result for every tool_call_id.
|
||||
|
|
@ -10035,7 +10087,31 @@ class AIAgent:
|
|||
if final_response is None and (
|
||||
api_call_count >= self.max_iterations
|
||||
or self.iteration_budget.remaining <= 0
|
||||
):
|
||||
) and not self._budget_exhausted_injected:
|
||||
# Budget exhausted but we haven't tried asking the model to
|
||||
# summarise yet. Inject a user message and give it one grace
|
||||
# API call to produce a text response.
|
||||
self._budget_exhausted_injected = True
|
||||
self._budget_grace_call = True
|
||||
_grace_msg = (
|
||||
"Your tool budget ran out. Please give me the information "
|
||||
"or actions you've completed so far."
|
||||
)
|
||||
messages.append({"role": "user", "content": _grace_msg})
|
||||
self._emit_status(
|
||||
f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
|
||||
"— asking model to summarise"
|
||||
)
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(
|
||||
f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
|
||||
"— requesting summary..."
|
||||
)
|
||||
|
||||
if final_response is None and (
|
||||
api_call_count >= self.max_iterations
|
||||
or self.iteration_budget.remaining <= 0
|
||||
) and not self._budget_grace_call:
|
||||
_turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})"
|
||||
if self.iteration_budget.remaining <= 0 and not self.quiet_mode:
|
||||
print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue