diff --git a/batch_runner.py b/batch_runner.py index c3679cdc3..9a3f3e3b4 100644 --- a/batch_runner.py +++ b/batch_runner.py @@ -41,24 +41,17 @@ from toolset_distributions import ( sample_toolsets_from_distribution, validate_distribution ) +from model_tools import TOOL_TO_TOOLSET_MAP # Global configuration for worker processes _WORKER_CONFIG = {} -# All possible tools - used to ensure consistent schema across all trajectory entries -# This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas -ALL_POSSIBLE_TOOLS = { - 'terminal', 'web_search', 'web_extract', - 'vision_analyze', 'image_generate', 'mixture_of_agents', - # Skills tools - 'skills_categories', 'skills_list', 'skill_view', - # Browser automation tools - 'browser_navigate', 'browser_snapshot', 'browser_click', - 'browser_type', 'browser_scroll', 'browser_back', - 'browser_press', 'browser_close', 'browser_get_images', - 'browser_vision' -} +# All possible tools - auto-derived from the master mapping in model_tools.py. +# This stays in sync automatically when new tools are added to TOOL_TO_TOOLSET_MAP. +# Used for consistent schema in Arrow/Parquet (HuggingFace datasets) and for +# filtering corrupted entries during trajectory combination. +ALL_POSSIBLE_TOOLS = set(TOOL_TO_TOOLSET_MAP.keys()) # Default stats for tools that weren't used DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0} @@ -200,6 +193,42 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i return tool_stats +def _extract_reasoning_stats(messages: List[Dict[str, Any]]) -> Dict[str, int]: + """ + Count how many assistant turns have reasoning vs no reasoning. + + Checks for in content or a non-empty 'reasoning' field + (native thinking tokens). Returns counts for tracking reasoning coverage. 
+ + Args: + messages: Message history + + Returns: + Dict with 'total_assistant_turns', 'turns_with_reasoning', 'turns_without_reasoning' + """ + total = 0 + with_reasoning = 0 + + for msg in messages: + if msg.get("role") != "assistant": + continue + total += 1 + + content = msg.get("content", "") or "" + has_scratchpad = "" in content + has_native_reasoning = bool(msg.get("reasoning", "").strip()) if msg.get("reasoning") else False + + if has_scratchpad or has_native_reasoning: + with_reasoning += 1 + + return { + "total_assistant_turns": total, + "turns_with_reasoning": with_reasoning, + "turns_without_reasoning": total - with_reasoning, + "has_any_reasoning": with_reasoning > 0, + } + + def _process_single_prompt( prompt_index: int, prompt_data: Dict[str, Any], @@ -255,6 +284,9 @@ def _process_single_prompt( # Extract tool usage statistics tool_stats = _extract_tool_stats(result["messages"]) + # Extract reasoning coverage stats + reasoning_stats = _extract_reasoning_stats(result["messages"]) + # Convert to trajectory format (using existing method) trajectory = agent._convert_to_trajectory_format( result["messages"], @@ -267,6 +299,7 @@ def _process_single_prompt( "prompt_index": prompt_index, "trajectory": trajectory, "tool_stats": tool_stats, + "reasoning_stats": reasoning_stats, "completed": result["completed"], "partial": result.get("partial", False), "api_calls": result["api_calls"], @@ -335,7 +368,9 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]: # Initialize aggregated stats for this batch batch_tool_stats = {} + batch_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0} completed_in_batch = [] + discarded_no_reasoning = 0 # Process each prompt sequentially in this batch for prompt_index, prompt_data in prompts_to_process: @@ -349,6 +384,13 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]: # Save trajectory if successful if result["success"] and result["trajectory"]: + # Discard 
samples with zero reasoning across all turns + reasoning = result.get("reasoning_stats", {}) + if not reasoning.get("has_any_reasoning", True): + print(f" 🚫 Prompt {prompt_index} discarded (no reasoning in any turn)") + discarded_no_reasoning += 1 + continue + # Get and normalize tool stats for consistent schema across all entries raw_tool_stats = result.get("tool_stats", {}) tool_stats = _normalize_tool_stats(raw_tool_stats) @@ -389,6 +431,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]: batch_tool_stats[tool_name]["success"] += stats["success"] batch_tool_stats[tool_name]["failure"] += stats["failure"] + # Aggregate reasoning stats + for key in batch_reasoning_stats: + batch_reasoning_stats[key] += result.get("reasoning_stats", {}).get(key, 0) + # Only mark as completed if successfully saved (failed prompts can be retried on resume) if result["success"] and result["trajectory"]: completed_in_batch.append(prompt_index) @@ -404,6 +450,8 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]: "processed": len(prompts_to_process), "skipped": len(batch_data) - len(prompts_to_process), "tool_stats": batch_tool_stats, + "reasoning_stats": batch_reasoning_stats, + "discarded_no_reasoning": discarded_no_reasoning, "completed_prompts": completed_in_batch } @@ -434,6 +482,7 @@ class BatchRunner: max_tokens: int = None, reasoning_config: Dict[str, Any] = None, prefill_messages: List[Dict[str, Any]] = None, + max_samples: int = None, ): """ Initialize the batch runner. @@ -458,6 +507,7 @@ class BatchRunner: max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) reasoning_config (Dict): OpenRouter reasoning config override (e.g. 
{"effort": "none"} to disable thinking) prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming) + max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set) """ self.dataset_file = Path(dataset_file) self.batch_size = batch_size @@ -478,6 +528,7 @@ class BatchRunner: self.max_tokens = max_tokens self.reasoning_config = reasoning_config self.prefill_messages = prefill_messages + self.max_samples = max_samples # Validate distribution if not validate_distribution(distribution): @@ -493,8 +544,12 @@ class BatchRunner: # Statistics file self.stats_file = self.output_dir / "statistics.json" - # Load dataset + # Load dataset (and optionally truncate to max_samples) self.dataset = self._load_dataset() + if self.max_samples and self.max_samples < len(self.dataset): + full_count = len(self.dataset) + self.dataset = self.dataset[:self.max_samples] + print(f"āœ‚ļø Truncated dataset from {full_count} to {self.max_samples} samples (--max_samples)") # Create batches self.batches = self._create_batches() @@ -812,6 +867,8 @@ class BatchRunner: # Aggregate all batch statistics and update checkpoint all_completed_prompts = list(completed_prompts_set) + total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0} + for batch_result in results: # Add newly completed prompts all_completed_prompts.extend(batch_result.get("completed_prompts", [])) @@ -828,6 +885,10 @@ class BatchRunner: total_tool_stats[tool_name]["count"] += stats["count"] total_tool_stats[tool_name]["success"] += stats["success"] total_tool_stats[tool_name]["failure"] += stats["failure"] + + # Aggregate reasoning stats + for key in total_reasoning_stats: + total_reasoning_stats[key] += batch_result.get("reasoning_stats", {}).get(key, 0) # Save final checkpoint checkpoint_data["completed_prompts"] = all_completed_prompts @@ -850,15 +911,8 @@ class BatchRunner: combined_file = 
self.output_dir / "trajectories.jsonl" print(f"\nšŸ“¦ Combining ALL batch files into {combined_file.name}...") - VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze', - 'image_generate', 'mixture_of_agents', - # Skills tools - 'skills_categories', 'skills_list', 'skill_view', - # Browser automation tools - 'browser_navigate', 'browser_snapshot', 'browser_click', - 'browser_type', 'browser_scroll', 'browser_back', - 'browser_press', 'browser_close', 'browser_get_images', - 'browser_vision'} + # Valid tools auto-derived from model_tools.py — no manual updates needed + VALID_TOOLS = ALL_POSSIBLE_TOOLS total_entries = 0 filtered_entries = 0 @@ -907,7 +961,8 @@ class BatchRunner: "model": self.model, "completed_at": datetime.now().isoformat(), "duration_seconds": round(time.time() - start_time, 2), - "tool_statistics": total_tool_stats + "tool_statistics": total_tool_stats, + "reasoning_statistics": total_reasoning_stats, } with open(self.stats_file, 'w', encoding='utf-8') as f: @@ -945,6 +1000,25 @@ class BatchRunner: else: print("No tool calls were made during this run.") + # Print reasoning coverage stats + total_discarded = sum(r.get("discarded_no_reasoning", 0) for r in results) + + print(f"\n🧠 Reasoning Coverage:") + print("-" * 70) + total_turns = total_reasoning_stats["total_assistant_turns"] + with_reasoning = total_reasoning_stats["turns_with_reasoning"] + without_reasoning = total_reasoning_stats["turns_without_reasoning"] + if total_turns > 0: + pct_with = round(with_reasoning / total_turns * 100, 1) + pct_without = round(without_reasoning / total_turns * 100, 1) + print(f" Total assistant turns: {total_turns:,}") + print(f" With reasoning: {with_reasoning:,} ({pct_with}%)") + print(f" Without reasoning: {without_reasoning:,} ({pct_without}%)") + else: + print(" No assistant turns recorded.") + if total_discarded > 0: + print(f" 🚫 Samples discarded (zero reasoning): {total_discarded:,}") + print(f"\nšŸ’¾ Results saved to: 
{self.output_dir}") print(f" - Trajectories: trajectories.jsonl (combined)") print(f" - Individual batches: batch_*.jsonl (for debugging)") @@ -975,6 +1049,7 @@ def main( reasoning_effort: str = None, reasoning_disabled: bool = False, prefill_messages_file: str = None, + max_samples: int = None, ): """ Run batch processing of agent prompts from a dataset. @@ -1002,6 +1077,7 @@ def main( reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh") reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False) prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts) + max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set) Examples: # Basic usage @@ -1110,6 +1186,7 @@ def main( max_tokens=max_tokens, reasoning_config=reasoning_config, prefill_messages=prefill_messages, + max_samples=max_samples, ) runner.run(resume=resume) diff --git a/model_tools.py b/model_tools.py index 710615099..9210e732e 100644 --- a/model_tools.py +++ b/model_tools.py @@ -700,13 +700,21 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "read_file", - "description": "Read a file with pagination support. Returns content with line numbers in 'LINE_NUM|CONTENT' format. For binary files (images), returns base64-encoded data. If file not found, suggests similar filenames.", + "description": ( + "Read a file with pagination support. 
Preferred over 'cat' in the terminal because it " + "provides line numbers, handles binary/image files, and suggests similar filenames if " + "the file is not found.\n\n" + "**Output format:** Each line is returned as 'LINE_NUM|CONTENT' for easy reference.\n" + "**Binary files:** Detected automatically; images (png/jpg/gif/webp) are returned as base64 with MIME type and dimensions.\n" + "**Large files:** Use offset and limit to paginate. The response includes total line count and a hint for the next page.\n" + "**Paths:** Supports absolute paths, relative paths (from working directory), and ~ expansion." + ), "parameters": { "type": "object", "properties": { "path": { "type": "string", - "description": "Path to the file to read (absolute or relative)" + "description": "Path to the file to read (absolute, relative, or ~/path)" }, "offset": { "type": "integer", @@ -729,17 +737,25 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "write_file", - "description": "Write content to a file. Creates parent directories automatically. Returns bytes written and lint check results for supported languages.", + "description": ( + "Write content to a file, completely replacing any existing content. Creates parent " + "directories automatically if they don't exist. Preferred over 'echo' or heredoc in the " + "terminal because it safely handles special characters, newlines, and shell metacharacters " + "without escaping issues.\n\n" + "**Important:** This OVERWRITES the entire file. To make targeted edits to an existing file, " + "use the 'patch' tool instead.\n" + "**Paths:** Supports absolute paths, relative paths, and ~ expansion." 
+ ), "parameters": { "type": "object", "properties": { "path": { "type": "string", - "description": "Path to the file to write (will be created if doesn't exist)" + "description": "Path to the file to write (will be created if it doesn't exist, overwritten if it does)" }, "content": { "type": "string", - "description": "Content to write to the file" + "description": "Complete content to write to the file" } }, "required": ["path", "content"] @@ -750,36 +766,48 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "patch", - "description": "Modify files using either simple string replacement or V4A patch format. Mode 'replace' does find-and-replace with fuzzy matching. Mode 'patch' applies multi-file changes using V4A format (*** Begin/End Patch). Auto-runs syntax checks on modified files.", + "description": ( + "Modify existing files using targeted edits. Preferred over 'sed' or manual rewriting because " + "it uses intelligent fuzzy matching that tolerates minor whitespace and indentation differences, " + "and auto-runs syntax checks (Python, JS, TS, Go, Rust) after editing.\n\n" + "**Replace mode (recommended):** Find a unique string in the file and replace it. Uses a " + "9-strategy fuzzy matching chain (exact → line-trimmed → whitespace-normalized → " + "indentation-flexible → context-aware) so small formatting differences won't cause failures. " + "Returns a unified diff showing exactly what changed.\n\n" + "**Patch mode:** Apply multi-file changes using V4A patch format for large-scale edits across " + "multiple files in one call.\n\n" + "**Auto-lint:** After every edit, automatically runs syntax checks and reports errors so you " + "can fix them immediately." 
+ ), "parameters": { "type": "object", "properties": { "mode": { "type": "string", "enum": ["replace", "patch"], - "description": "Edit mode: 'replace' for string replacement, 'patch' for V4A patch format", + "description": "Edit mode: 'replace' for targeted find-and-replace, 'patch' for V4A multi-file patches", "default": "replace" }, "path": { "type": "string", - "description": "File path (required for 'replace' mode)" + "description": "File path to edit (required for 'replace' mode)" }, "old_string": { "type": "string", - "description": "Text to find and replace (required for 'replace' mode). Must be unique in file unless replace_all=true" + "description": "Text to find in the file (required for 'replace' mode). Must be unique in the file unless replace_all=true. Include enough surrounding context to ensure uniqueness." }, "new_string": { "type": "string", - "description": "Replacement text (required for 'replace' mode)" + "description": "Replacement text (required for 'replace' mode). Can be empty string to delete the matched text." }, "replace_all": { "type": "boolean", - "description": "Replace all occurrences instead of requiring unique match (default: false)", + "description": "Replace all occurrences instead of requiring a unique match (default: false)", "default": False }, "patch": { "type": "string", - "description": "V4A format patch content (required for 'patch' mode). Format: *** Begin Patch / *** Update File: path / @@ context @@ / -removed / +added / *** End Patch" + "description": "V4A format patch content (required for 'patch' mode). Format:\n*** Begin Patch\n*** Update File: path/to/file\n@@ context hint @@\n context line\n-removed line\n+added line\n*** End Patch" } }, "required": ["mode"] @@ -790,7 +818,16 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "search", - "description": "Search for content in files or search for files by name. 
Use target='content' to search inside files (like grep), or target='files' to find files by name pattern (like glob/find). Results sorted by modification time (newest first).", + "description": ( + "Search for content inside files or find files by name. Preferred over 'grep' or 'find' " + "in the terminal because it uses ripgrep (fast) with automatic fallback to grep, handles " + "pagination, and returns structured results sorted by modification time (newest first).\n\n" + "**Content search (target='content'):** Regex-powered search inside files with optional " + "file type filtering and context lines. Three output modes: full matches with line numbers, " + "file paths only, or match counts per file.\n\n" + "**File search (target='files'):** Find files by glob pattern (e.g., '*.py', '*config*'). " + "Results sorted by modification time so recently changed files appear first." + ), "parameters": { "type": "object", "properties": { @@ -801,12 +838,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "target": { "type": "string", "enum": ["content", "files"], - "description": "Search mode: 'content' searches inside files, 'files' searches for files by name", + "description": "Search mode: 'content' searches inside files (like grep/rg), 'files' searches for files by name (like find/glob)", "default": "content" }, "path": { "type": "string", - "description": "Directory or file to search in (default: current directory)", + "description": "Directory or file to search in (default: current working directory)", "default": "." 
}, "file_glob": { @@ -815,7 +852,7 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: }, "limit": { "type": "integer", - "description": "Maximum number of results (default: 50)", + "description": "Maximum number of results to return (default: 50)", "default": 50 }, "offset": { @@ -826,12 +863,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]: "output_mode": { "type": "string", "enum": ["content", "files_only", "count"], - "description": "For target='content': 'content' shows matches, 'files_only' shows file paths, 'count' shows match counts per file", + "description": "Output format for content search: 'content' shows matching lines with line numbers, 'files_only' lists file paths, 'count' shows match counts per file", "default": "content" }, "context": { "type": "integer", - "description": "Lines of context around matches (only for target='content', output_mode='content')", + "description": "Number of lines to show before and after each match (only for target='content', output_mode='content')", "default": 0 } }, @@ -909,6 +946,53 @@ def get_all_tool_names() -> List[str]: return tool_names +# Master mapping of every tool name → its toolset. +# This is the single source of truth for all valid tool names in the system. +# Import TOOL_TO_TOOLSET_MAP from here whenever you need to check valid tools. 
+TOOL_TO_TOOLSET_MAP = { + "web_search": "web_tools", + "web_extract": "web_tools", + "terminal": "terminal_tools", + "vision_analyze": "vision_tools", + "mixture_of_agents": "moa_tools", + "image_generate": "image_tools", + # Skills tools + "skills_categories": "skills_tools", + "skills_list": "skills_tools", + "skill_view": "skills_tools", + # Browser automation tools + "browser_navigate": "browser_tools", + "browser_snapshot": "browser_tools", + "browser_click": "browser_tools", + "browser_type": "browser_tools", + "browser_scroll": "browser_tools", + "browser_back": "browser_tools", + "browser_press": "browser_tools", + "browser_close": "browser_tools", + "browser_get_images": "browser_tools", + "browser_vision": "browser_tools", + # Cronjob management tools + "schedule_cronjob": "cronjob_tools", + "list_cronjobs": "cronjob_tools", + "remove_cronjob": "cronjob_tools", + # RL Training tools + "rl_list_environments": "rl_tools", + "rl_select_environment": "rl_tools", + "rl_get_current_config": "rl_tools", + "rl_edit_config": "rl_tools", + "rl_start_training": "rl_tools", + "rl_check_status": "rl_tools", + "rl_stop_training": "rl_tools", + "rl_get_results": "rl_tools", + "rl_list_runs": "rl_tools", + # File manipulation tools + "read_file": "file_tools", + "write_file": "file_tools", + "patch": "file_tools", + "search": "file_tools", +} + + def get_toolset_for_tool(tool_name: str) -> str: """ Get the toolset that a tool belongs to. 
@@ -919,50 +1003,7 @@ def get_toolset_for_tool(tool_name: str) -> str: Returns: str: Name of the toolset, or "unknown" if not found """ - toolset_mapping = { - "web_search": "web_tools", - "web_extract": "web_tools", - "terminal": "terminal_tools", - "vision_analyze": "vision_tools", - "mixture_of_agents": "moa_tools", - "image_generate": "image_tools", - # Skills tools - "skills_categories": "skills_tools", - "skills_list": "skills_tools", - "skill_view": "skills_tools", - # Browser automation tools - "browser_navigate": "browser_tools", - "browser_snapshot": "browser_tools", - "browser_click": "browser_tools", - "browser_type": "browser_tools", - "browser_scroll": "browser_tools", - "browser_back": "browser_tools", - "browser_press": "browser_tools", - "browser_close": "browser_tools", - "browser_get_images": "browser_tools", - "browser_vision": "browser_tools", - # Cronjob management tools - "schedule_cronjob": "cronjob_tools", - "list_cronjobs": "cronjob_tools", - "remove_cronjob": "cronjob_tools", - # RL Training tools - "rl_list_environments": "rl_tools", - "rl_select_environment": "rl_tools", - "rl_get_current_config": "rl_tools", - "rl_edit_config": "rl_tools", - "rl_start_training": "rl_tools", - "rl_check_status": "rl_tools", - "rl_stop_training": "rl_tools", - "rl_get_results": "rl_tools", - "rl_list_runs": "rl_tools", - # File manipulation tools - "read_file": "file_tools", - "write_file": "file_tools", - "patch": "file_tools", - "search": "file_tools", - } - - return toolset_mapping.get(tool_name, "unknown") + return TOOL_TO_TOOLSET_MAP.get(tool_name, "unknown") def get_tool_definitions( diff --git a/run_agent.py b/run_agent.py index eeb24dd8f..078f8f0f4 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1120,6 +1120,24 @@ class AIAgent: return content return content.replace("", "").replace("", "") + @staticmethod + def _has_incomplete_scratchpad(content: str) -> bool: + """ + Check if content has an opening without a closing tag. 
+ + This indicates the model ran out of output tokens mid-reasoning, producing + a broken turn that shouldn't be saved. The caller should retry or discard. + + Args: + content: Assistant message content to check + + Returns: + True if there's an unclosed scratchpad tag + """ + if not content: + return False + return "" in content and "" not in content + def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]: """ Convert internal message format to trajectory format for saving. @@ -1204,6 +1222,11 @@ class AIAgent: } content += f"\n{json.dumps(tool_call_json, ensure_ascii=False)}\n\n" + # Ensure every gpt turn has a block (empty if no reasoning) + # so the format is consistent for training data + if "" not in content: + content = "\n\n" + content + trajectory.append({ "from": "gpt", "value": content.rstrip() @@ -1256,6 +1279,10 @@ class AIAgent: raw_content = msg["content"] or "" content += self._convert_scratchpad_to_think(raw_content) + # Ensure every gpt turn has a block (empty if no reasoning) + if "" not in content: + content = "\n\n" + content + trajectory.append({ "from": "gpt", "value": content.strip() @@ -1903,6 +1930,48 @@ class AIAgent: if assistant_message.content and not self.quiet_mode: print(f"{self.log_prefix}šŸ¤– Assistant: {assistant_message.content[:100]}{'...' 
if len(assistant_message.content) > 100 else ''}") + # Check for incomplete (opened but never closed) + # This means the model ran out of output tokens mid-reasoning — retry up to 2 times + if self._has_incomplete_scratchpad(assistant_message.content or ""): + if not hasattr(self, '_incomplete_scratchpad_retries'): + self._incomplete_scratchpad_retries = 0 + self._incomplete_scratchpad_retries += 1 + + print(f"{self.log_prefix}āš ļø Incomplete detected (opened but never closed)") + + if self._incomplete_scratchpad_retries <= 2: + print(f"{self.log_prefix}šŸ”„ Retrying API call ({self._incomplete_scratchpad_retries}/2)...") + # Don't add the broken message, just retry + continue + else: + # Max retries - discard this turn and save as partial + print(f"{self.log_prefix}āŒ Max retries (2) for incomplete scratchpad. Saving as partial.") + self._incomplete_scratchpad_retries = 0 + + rolled_back_messages = self._get_messages_up_to_last_assistant(messages) + + try: + cleanup_vm(effective_task_id) + except Exception: + pass + try: + cleanup_browser(effective_task_id) + except Exception: + pass + + return { + "final_response": None, + "messages": rolled_back_messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Incomplete REASONING_SCRATCHPAD after 2 retries" + } + + # Reset incomplete scratchpad counter on clean response + if hasattr(self, '_incomplete_scratchpad_retries'): + self._incomplete_scratchpad_retries = 0 + # Check for tool calls if assistant_message.tool_calls: if not self.quiet_mode: diff --git a/toolset_distributions.py b/toolset_distributions.py index 7f829c278..0dc23b887 100644 --- a/toolset_distributions.py +++ b/toolset_distributions.py @@ -198,10 +198,10 @@ DISTRIBUTIONS = { "toolsets": { "terminal": 97, # 97% - terminal almost always available "file": 97, # 97% - file tools almost always available - "web": 15, # 15% - web search/scrape for documentation - "browser": 10, # 10% - browser occasionally for web 
interaction - "vision": 8, # 8% - vision analysis rarely - "image_gen": 3 # 3% - image generation very rarely + "web": 97, # 97% - web search/scrape for documentation + "browser": 75, # 75% - browser frequently for web interaction + "vision": 50, # 50% - vision analysis half the time + "image_gen": 10 # 10% - image generation occasionally } },