diff --git a/run_agent.py b/run_agent.py
index 080682de86..ce8954d6da 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -516,6 +516,9 @@ class AIAgent:
         checkpoint_max_snapshots: int = 50,
         pass_session_id: bool = False,
         persist_session: bool = True,
+        use_streaming: bool = True,
+        temperature: float = None,
+        insert_reasoning: bool = True,
     ):
         """
         Initialize the AI Agent.
@@ -559,11 +562,17 @@ class AIAgent:
                 When provided and Honcho is enabled in config, enables persistent cross-session user modeling.
             honcho_manager: Optional shared HonchoSessionManager owned by the caller.
             honcho_config: Optional HonchoClientConfig corresponding to honcho_manager.
+            use_streaming (bool): Whether to use streaming for API calls (default: True)
+            temperature (float): Temperature for model responses (optional, uses model default if not set)
+            insert_reasoning (bool): Whether to include stored reasoning (as reasoning_content) in outgoing API messages (default: True)
         """
         _install_safe_stdio()

         self.model = model
         self.max_iterations = max_iterations
+        self.use_streaming = use_streaming
+        self.temperature = temperature
+        self.insert_reasoning = insert_reasoning
         # Shared iteration budget — parent creates, children inherit.
         # Consumed by every LLM turn across parent + all subagents.
         self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
@@ -5063,6 +5072,8 @@ class AIAgent:
             "messages": sanitized_messages,
             "timeout": float(os.getenv("HERMES_API_TIMEOUT", 1800.0)),
         }
+        if self.temperature is not None:
+            api_kwargs["temperature"] = self.temperature

         if self.tools:
             api_kwargs["tools"] = self.tools
@@ -5390,7 +5401,7 @@ class AIAgent:
             api_msg = msg.copy()
             if msg.get("role") == "assistant":
                 reasoning = msg.get("reasoning")
-                if reasoning:
+                if reasoning and self.insert_reasoning:
                     api_msg["reasoning_content"] = reasoning
                 api_msg.pop("reasoning", None)
             api_msg.pop("finish_reason", None)
@@ -6387,6 +6398,7 @@ class AIAgent:
         stream_callback: Optional[callable] = None,
         persist_user_message: Optional[str] = None,
         sync_honcho: bool = True,
+        dont_review: bool = False,
     ) -> Dict[str, Any]:
         """
         Run a complete conversation with tool calling until completion.
@@ -6404,7 +6416,7 @@ class AIAgent:
                 synthetic prefixes.
             sync_honcho: When False, skip writing the final synthetic turn
                 back to Honcho or queuing follow-up prefetch work.
-
+            dont_review: When True, skip reviewing memory and skills.
         Returns:
             Dict: Complete conversation result with final response and message history
         """
@@ -6741,7 +6753,7 @@ class AIAgent:
                 # This ensures multi-turn reasoning context is preserved
                 if msg.get("role") == "assistant":
                     reasoning_text = msg.get("reasoning")
-                    if reasoning_text:
+                    if reasoning_text and self.insert_reasoning:
                         # Add reasoning_content for API compatibility (Moonshot AI, Novita, OpenRouter)
                         api_msg["reasoning_content"] = reasoning_text

@@ -6869,7 +6881,7 @@ class AIAgent:
             if self.thinking_callback:
                 self.thinking_callback("")

-            _use_streaming = True
+            _use_streaming = self.use_streaming
             if not self._has_stream_consumers():
                 # No display/TTS consumer. Still prefer streaming for
                 # health checking, but skip for Mock clients in tests
@@ -7520,7 +7532,7 @@ class AIAgent:
                     force=True,
                 )

-            if is_context_length_error:
+            if is_context_length_error and self.compression_enabled:
                 compressor = self.context_compressor
                 old_ctx = compressor.context_length

@@ -7589,6 +7601,14 @@ class AIAgent:
                         "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
                         "partial": True
                     }
+            elif is_context_length_error and not self.compression_enabled:
+                return {
+                    "messages": messages,
+                    "completed": False,
+                    "api_calls": api_call_count,
+                    "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
+                    "partial": True
+                }

             # Check for non-retryable client errors (4xx HTTP status codes).
             # These indicate a problem with the request itself (bad model ID,
@@ -8493,7 +8513,9 @@ class AIAgent:
                     and "skill_manage" in self.valid_tool_names):
                 _should_review_skills = True
                 self._iters_since_skill = 0
-
+        if dont_review:
+            _should_review_memory = False
+            _should_review_skills = False
         # Background memory/skill review — runs AFTER the response is delivered
         # so it never competes with the user's task for model attention.
         if final_response and not interrupted and (_should_review_memory or _should_review_skills):