Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.
This commit is contained in:
teknium 2026-02-08 20:19:14 +00:00
parent f12ea1bc02
commit dd70d57b9b
4 changed files with 277 additions and 90 deletions

View file

@ -41,24 +41,17 @@ from toolset_distributions import (
sample_toolsets_from_distribution,
validate_distribution
)
from model_tools import TOOL_TO_TOOLSET_MAP
# Global configuration for worker processes
_WORKER_CONFIG = {}
# All possible tools - used to ensure consistent schema across all trajectory entries
# This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas
ALL_POSSIBLE_TOOLS = {
'terminal', 'web_search', 'web_extract',
'vision_analyze', 'image_generate', 'mixture_of_agents',
# Skills tools
'skills_categories', 'skills_list', 'skill_view',
# Browser automation tools
'browser_navigate', 'browser_snapshot', 'browser_click',
'browser_type', 'browser_scroll', 'browser_back',
'browser_press', 'browser_close', 'browser_get_images',
'browser_vision'
}
# All possible tools — auto-derived from the master mapping in model_tools.py,
# so this stays in sync automatically whenever a new tool is added to
# TOOL_TO_TOOLSET_MAP. Used for a consistent Arrow/Parquet schema
# (HuggingFace datasets) and for filtering corrupted entries when combining
# trajectory files.
ALL_POSSIBLE_TOOLS = {tool for tool in TOOL_TO_TOOLSET_MAP}

# Per-tool stats placeholder used for tools that were never invoked.
DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0}
@ -200,6 +193,42 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
return tool_stats
def _extract_reasoning_stats(messages: List[Dict[str, Any]]) -> Dict[str, int]:
"""
Count how many assistant turns have reasoning vs no reasoning.
Checks for <REASONING_SCRATCHPAD> in content or a non-empty 'reasoning' field
(native thinking tokens). Returns counts for tracking reasoning coverage.
Args:
messages: Message history
Returns:
Dict with 'total_assistant_turns', 'turns_with_reasoning', 'turns_without_reasoning'
"""
total = 0
with_reasoning = 0
for msg in messages:
if msg.get("role") != "assistant":
continue
total += 1
content = msg.get("content", "") or ""
has_scratchpad = "<REASONING_SCRATCHPAD>" in content
has_native_reasoning = bool(msg.get("reasoning", "").strip()) if msg.get("reasoning") else False
if has_scratchpad or has_native_reasoning:
with_reasoning += 1
return {
"total_assistant_turns": total,
"turns_with_reasoning": with_reasoning,
"turns_without_reasoning": total - with_reasoning,
"has_any_reasoning": with_reasoning > 0,
}
def _process_single_prompt(
prompt_index: int,
prompt_data: Dict[str, Any],
@ -255,6 +284,9 @@ def _process_single_prompt(
# Extract tool usage statistics
tool_stats = _extract_tool_stats(result["messages"])
# Extract reasoning coverage stats
reasoning_stats = _extract_reasoning_stats(result["messages"])
# Convert to trajectory format (using existing method)
trajectory = agent._convert_to_trajectory_format(
result["messages"],
@ -267,6 +299,7 @@ def _process_single_prompt(
"prompt_index": prompt_index,
"trajectory": trajectory,
"tool_stats": tool_stats,
"reasoning_stats": reasoning_stats,
"completed": result["completed"],
"partial": result.get("partial", False),
"api_calls": result["api_calls"],
@ -335,7 +368,9 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
# Initialize aggregated stats for this batch
batch_tool_stats = {}
batch_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
completed_in_batch = []
discarded_no_reasoning = 0
# Process each prompt sequentially in this batch
for prompt_index, prompt_data in prompts_to_process:
@ -349,6 +384,13 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
# Save trajectory if successful
if result["success"] and result["trajectory"]:
# Discard samples with zero reasoning across all turns
reasoning = result.get("reasoning_stats", {})
if not reasoning.get("has_any_reasoning", True):
print(f" 🚫 Prompt {prompt_index} discarded (no reasoning in any turn)")
discarded_no_reasoning += 1
continue
# Get and normalize tool stats for consistent schema across all entries
raw_tool_stats = result.get("tool_stats", {})
tool_stats = _normalize_tool_stats(raw_tool_stats)
@ -389,6 +431,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
batch_tool_stats[tool_name]["success"] += stats["success"]
batch_tool_stats[tool_name]["failure"] += stats["failure"]
# Aggregate reasoning stats
for key in batch_reasoning_stats:
batch_reasoning_stats[key] += result.get("reasoning_stats", {}).get(key, 0)
# Only mark as completed if successfully saved (failed prompts can be retried on resume)
if result["success"] and result["trajectory"]:
completed_in_batch.append(prompt_index)
@ -404,6 +450,8 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
"processed": len(prompts_to_process),
"skipped": len(batch_data) - len(prompts_to_process),
"tool_stats": batch_tool_stats,
"reasoning_stats": batch_reasoning_stats,
"discarded_no_reasoning": discarded_no_reasoning,
"completed_prompts": completed_in_batch
}
@ -434,6 +482,7 @@ class BatchRunner:
max_tokens: int = None,
reasoning_config: Dict[str, Any] = None,
prefill_messages: List[Dict[str, Any]] = None,
max_samples: int = None,
):
"""
Initialize the batch runner.
@ -458,6 +507,7 @@ class BatchRunner:
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
"""
self.dataset_file = Path(dataset_file)
self.batch_size = batch_size
@ -478,6 +528,7 @@ class BatchRunner:
self.max_tokens = max_tokens
self.reasoning_config = reasoning_config
self.prefill_messages = prefill_messages
self.max_samples = max_samples
# Validate distribution
if not validate_distribution(distribution):
@ -493,8 +544,12 @@ class BatchRunner:
# Statistics file
self.stats_file = self.output_dir / "statistics.json"
# Load dataset
# Load dataset (and optionally truncate to max_samples)
self.dataset = self._load_dataset()
if self.max_samples and self.max_samples < len(self.dataset):
full_count = len(self.dataset)
self.dataset = self.dataset[:self.max_samples]
print(f"✂️ Truncated dataset from {full_count} to {self.max_samples} samples (--max_samples)")
# Create batches
self.batches = self._create_batches()
@ -812,6 +867,8 @@ class BatchRunner:
# Aggregate all batch statistics and update checkpoint
all_completed_prompts = list(completed_prompts_set)
total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
for batch_result in results:
# Add newly completed prompts
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
@ -828,6 +885,10 @@ class BatchRunner:
total_tool_stats[tool_name]["count"] += stats["count"]
total_tool_stats[tool_name]["success"] += stats["success"]
total_tool_stats[tool_name]["failure"] += stats["failure"]
# Aggregate reasoning stats
for key in total_reasoning_stats:
total_reasoning_stats[key] += batch_result.get("reasoning_stats", {}).get(key, 0)
# Save final checkpoint
checkpoint_data["completed_prompts"] = all_completed_prompts
@ -850,15 +911,8 @@ class BatchRunner:
combined_file = self.output_dir / "trajectories.jsonl"
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze',
'image_generate', 'mixture_of_agents',
# Skills tools
'skills_categories', 'skills_list', 'skill_view',
# Browser automation tools
'browser_navigate', 'browser_snapshot', 'browser_click',
'browser_type', 'browser_scroll', 'browser_back',
'browser_press', 'browser_close', 'browser_get_images',
'browser_vision'}
# Valid tools auto-derived from model_tools.py — no manual updates needed
VALID_TOOLS = ALL_POSSIBLE_TOOLS
total_entries = 0
filtered_entries = 0
@ -907,7 +961,8 @@ class BatchRunner:
"model": self.model,
"completed_at": datetime.now().isoformat(),
"duration_seconds": round(time.time() - start_time, 2),
"tool_statistics": total_tool_stats
"tool_statistics": total_tool_stats,
"reasoning_statistics": total_reasoning_stats,
}
with open(self.stats_file, 'w', encoding='utf-8') as f:
@ -945,6 +1000,25 @@ class BatchRunner:
else:
print("No tool calls were made during this run.")
# Print reasoning coverage stats
total_discarded = sum(r.get("discarded_no_reasoning", 0) for r in results)
print(f"\n🧠 Reasoning Coverage:")
print("-" * 70)
total_turns = total_reasoning_stats["total_assistant_turns"]
with_reasoning = total_reasoning_stats["turns_with_reasoning"]
without_reasoning = total_reasoning_stats["turns_without_reasoning"]
if total_turns > 0:
pct_with = round(with_reasoning / total_turns * 100, 1)
pct_without = round(without_reasoning / total_turns * 100, 1)
print(f" Total assistant turns: {total_turns:,}")
print(f" With reasoning: {with_reasoning:,} ({pct_with}%)")
print(f" Without reasoning: {without_reasoning:,} ({pct_without}%)")
else:
print(" No assistant turns recorded.")
if total_discarded > 0:
print(f" 🚫 Samples discarded (zero reasoning): {total_discarded:,}")
print(f"\n💾 Results saved to: {self.output_dir}")
print(f" - Trajectories: trajectories.jsonl (combined)")
print(f" - Individual batches: batch_*.jsonl (for debugging)")
@ -975,6 +1049,7 @@ def main(
reasoning_effort: str = None,
reasoning_disabled: bool = False,
prefill_messages_file: str = None,
max_samples: int = None,
):
"""
Run batch processing of agent prompts from a dataset.
@ -1002,6 +1077,7 @@ def main(
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh")
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
Examples:
# Basic usage
@ -1110,6 +1186,7 @@ def main(
max_tokens=max_tokens,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,
max_samples=max_samples,
)
runner.run(resume=resume)

View file

@ -700,13 +700,21 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "read_file",
"description": "Read a file with pagination support. Returns content with line numbers in 'LINE_NUM|CONTENT' format. For binary files (images), returns base64-encoded data. If file not found, suggests similar filenames.",
"description": (
"Read a file with pagination support. Preferred over 'cat' in the terminal because it "
"provides line numbers, handles binary/image files, and suggests similar filenames if "
"the file is not found.\n\n"
"**Output format:** Each line is returned as 'LINE_NUM|CONTENT' for easy reference.\n"
"**Binary files:** Detected automatically; images (png/jpg/gif/webp) are returned as base64 with MIME type and dimensions.\n"
"**Large files:** Use offset and limit to paginate. The response includes total line count and a hint for the next page.\n"
"**Paths:** Supports absolute paths, relative paths (from working directory), and ~ expansion."
),
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to the file to read (absolute or relative)"
"description": "Path to the file to read (absolute, relative, or ~/path)"
},
"offset": {
"type": "integer",
@ -729,17 +737,25 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "write_file",
"description": "Write content to a file. Creates parent directories automatically. Returns bytes written and lint check results for supported languages.",
"description": (
"Write content to a file, completely replacing any existing content. Creates parent "
"directories automatically if they don't exist. Preferred over 'echo' or heredoc in the "
"terminal because it safely handles special characters, newlines, and shell metacharacters "
"without escaping issues.\n\n"
"**Important:** This OVERWRITES the entire file. To make targeted edits to an existing file, "
"use the 'patch' tool instead.\n"
"**Paths:** Supports absolute paths, relative paths, and ~ expansion."
),
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to the file to write (will be created if doesn't exist)"
"description": "Path to the file to write (will be created if it doesn't exist, overwritten if it does)"
},
"content": {
"type": "string",
"description": "Content to write to the file"
"description": "Complete content to write to the file"
}
},
"required": ["path", "content"]
@ -750,36 +766,48 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "patch",
"description": "Modify files using either simple string replacement or V4A patch format. Mode 'replace' does find-and-replace with fuzzy matching. Mode 'patch' applies multi-file changes using V4A format (*** Begin/End Patch). Auto-runs syntax checks on modified files.",
"description": (
"Modify existing files using targeted edits. Preferred over 'sed' or manual rewriting because "
"it uses intelligent fuzzy matching that tolerates minor whitespace and indentation differences, "
"and auto-runs syntax checks (Python, JS, TS, Go, Rust) after editing.\n\n"
"**Replace mode (recommended):** Find a unique string in the file and replace it. Uses a "
"9-strategy fuzzy matching chain (exact → line-trimmed → whitespace-normalized → "
"indentation-flexible → context-aware) so small formatting differences won't cause failures. "
"Returns a unified diff showing exactly what changed.\n\n"
"**Patch mode:** Apply multi-file changes using V4A patch format for large-scale edits across "
"multiple files in one call.\n\n"
"**Auto-lint:** After every edit, automatically runs syntax checks and reports errors so you "
"can fix them immediately."
),
"parameters": {
"type": "object",
"properties": {
"mode": {
"type": "string",
"enum": ["replace", "patch"],
"description": "Edit mode: 'replace' for string replacement, 'patch' for V4A patch format",
"description": "Edit mode: 'replace' for targeted find-and-replace, 'patch' for V4A multi-file patches",
"default": "replace"
},
"path": {
"type": "string",
"description": "File path (required for 'replace' mode)"
"description": "File path to edit (required for 'replace' mode)"
},
"old_string": {
"type": "string",
"description": "Text to find and replace (required for 'replace' mode). Must be unique in file unless replace_all=true"
"description": "Text to find in the file (required for 'replace' mode). Must be unique in the file unless replace_all=true. Include enough surrounding context to ensure uniqueness."
},
"new_string": {
"type": "string",
"description": "Replacement text (required for 'replace' mode)"
"description": "Replacement text (required for 'replace' mode). Can be empty string to delete the matched text."
},
"replace_all": {
"type": "boolean",
"description": "Replace all occurrences instead of requiring unique match (default: false)",
"description": "Replace all occurrences instead of requiring a unique match (default: false)",
"default": False
},
"patch": {
"type": "string",
"description": "V4A format patch content (required for 'patch' mode). Format: *** Begin Patch / *** Update File: path / @@ context @@ / -removed / +added / *** End Patch"
"description": "V4A format patch content (required for 'patch' mode). Format:\n*** Begin Patch\n*** Update File: path/to/file\n@@ context hint @@\n context line\n-removed line\n+added line\n*** End Patch"
}
},
"required": ["mode"]
@ -790,7 +818,16 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "search",
"description": "Search for content in files or search for files by name. Use target='content' to search inside files (like grep), or target='files' to find files by name pattern (like glob/find). Results sorted by modification time (newest first).",
"description": (
"Search for content inside files or find files by name. Preferred over 'grep' or 'find' "
"in the terminal because it uses ripgrep (fast) with automatic fallback to grep, handles "
"pagination, and returns structured results sorted by modification time (newest first).\n\n"
"**Content search (target='content'):** Regex-powered search inside files with optional "
"file type filtering and context lines. Three output modes: full matches with line numbers, "
"file paths only, or match counts per file.\n\n"
"**File search (target='files'):** Find files by glob pattern (e.g., '*.py', '*config*'). "
"Results sorted by modification time so recently changed files appear first."
),
"parameters": {
"type": "object",
"properties": {
@ -801,12 +838,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"target": {
"type": "string",
"enum": ["content", "files"],
"description": "Search mode: 'content' searches inside files, 'files' searches for files by name",
"description": "Search mode: 'content' searches inside files (like grep/rg), 'files' searches for files by name (like find/glob)",
"default": "content"
},
"path": {
"type": "string",
"description": "Directory or file to search in (default: current directory)",
"description": "Directory or file to search in (default: current working directory)",
"default": "."
},
"file_glob": {
@ -815,7 +852,7 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
},
"limit": {
"type": "integer",
"description": "Maximum number of results (default: 50)",
"description": "Maximum number of results to return (default: 50)",
"default": 50
},
"offset": {
@ -826,12 +863,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
"output_mode": {
"type": "string",
"enum": ["content", "files_only", "count"],
"description": "For target='content': 'content' shows matches, 'files_only' shows file paths, 'count' shows match counts per file",
"description": "Output format for content search: 'content' shows matching lines with line numbers, 'files_only' lists file paths, 'count' shows match counts per file",
"default": "content"
},
"context": {
"type": "integer",
"description": "Lines of context around matches (only for target='content', output_mode='content')",
"description": "Number of lines to show before and after each match (only for target='content', output_mode='content')",
"default": 0
}
},
@ -909,6 +946,53 @@ def get_all_tool_names() -> List[str]:
return tool_names
# Master mapping of every tool name → its toolset.
# This is the single source of truth for all valid tool names in the system.
# Import TOOL_TO_TOOLSET_MAP from here whenever you need to check valid tools.
#
# Declared inverted (toolset → member tools) for readability, then flattened
# into the flat tool → toolset dict below. Insertion order matches the
# historical flat declaration.
_TOOLSET_MEMBERS = {
    "web_tools": ("web_search", "web_extract"),
    "terminal_tools": ("terminal",),
    "vision_tools": ("vision_analyze",),
    "moa_tools": ("mixture_of_agents",),
    "image_tools": ("image_generate",),
    # Skills tools
    "skills_tools": ("skills_categories", "skills_list", "skill_view"),
    # Browser automation tools
    "browser_tools": (
        "browser_navigate", "browser_snapshot", "browser_click",
        "browser_type", "browser_scroll", "browser_back",
        "browser_press", "browser_close", "browser_get_images",
        "browser_vision",
    ),
    # Cronjob management tools
    "cronjob_tools": ("schedule_cronjob", "list_cronjobs", "remove_cronjob"),
    # RL Training tools
    "rl_tools": (
        "rl_list_environments", "rl_select_environment",
        "rl_get_current_config", "rl_edit_config", "rl_start_training",
        "rl_check_status", "rl_stop_training", "rl_get_results",
        "rl_list_runs",
    ),
    # File manipulation tools
    "file_tools": ("read_file", "write_file", "patch", "search"),
}

TOOL_TO_TOOLSET_MAP = {
    tool: toolset
    for toolset, members in _TOOLSET_MEMBERS.items()
    for tool in members
}
def get_toolset_for_tool(tool_name: str) -> str:
"""
Get the toolset that a tool belongs to.
@ -919,50 +1003,7 @@ def get_toolset_for_tool(tool_name: str) -> str:
Returns:
str: Name of the toolset, or "unknown" if not found
"""
toolset_mapping = {
"web_search": "web_tools",
"web_extract": "web_tools",
"terminal": "terminal_tools",
"vision_analyze": "vision_tools",
"mixture_of_agents": "moa_tools",
"image_generate": "image_tools",
# Skills tools
"skills_categories": "skills_tools",
"skills_list": "skills_tools",
"skill_view": "skills_tools",
# Browser automation tools
"browser_navigate": "browser_tools",
"browser_snapshot": "browser_tools",
"browser_click": "browser_tools",
"browser_type": "browser_tools",
"browser_scroll": "browser_tools",
"browser_back": "browser_tools",
"browser_press": "browser_tools",
"browser_close": "browser_tools",
"browser_get_images": "browser_tools",
"browser_vision": "browser_tools",
# Cronjob management tools
"schedule_cronjob": "cronjob_tools",
"list_cronjobs": "cronjob_tools",
"remove_cronjob": "cronjob_tools",
# RL Training tools
"rl_list_environments": "rl_tools",
"rl_select_environment": "rl_tools",
"rl_get_current_config": "rl_tools",
"rl_edit_config": "rl_tools",
"rl_start_training": "rl_tools",
"rl_check_status": "rl_tools",
"rl_stop_training": "rl_tools",
"rl_get_results": "rl_tools",
"rl_list_runs": "rl_tools",
# File manipulation tools
"read_file": "file_tools",
"write_file": "file_tools",
"patch": "file_tools",
"search": "file_tools",
}
return toolset_mapping.get(tool_name, "unknown")
return TOOL_TO_TOOLSET_MAP.get(tool_name, "unknown")
def get_tool_definitions(

View file

@ -1120,6 +1120,24 @@ class AIAgent:
return content
return content.replace("<REASONING_SCRATCHPAD>", "<think>").replace("</REASONING_SCRATCHPAD>", "</think>")
@staticmethod
def _has_incomplete_scratchpad(content: str) -> bool:
"""
Check if content has an opening <REASONING_SCRATCHPAD> without a closing tag.
This indicates the model ran out of output tokens mid-reasoning, producing
a broken turn that shouldn't be saved. The caller should retry or discard.
Args:
content: Assistant message content to check
Returns:
True if there's an unclosed scratchpad tag
"""
if not content:
return False
return "<REASONING_SCRATCHPAD>" in content and "</REASONING_SCRATCHPAD>" not in content
def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
"""
Convert internal message format to trajectory format for saving.
@ -1204,6 +1222,11 @@ class AIAgent:
}
content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
# Ensure every gpt turn has a <think> block (empty if no reasoning)
# so the format is consistent for training data
if "<think>" not in content:
content = "<think>\n</think>\n" + content
trajectory.append({
"from": "gpt",
"value": content.rstrip()
@ -1256,6 +1279,10 @@ class AIAgent:
raw_content = msg["content"] or ""
content += self._convert_scratchpad_to_think(raw_content)
# Ensure every gpt turn has a <think> block (empty if no reasoning)
if "<think>" not in content:
content = "<think>\n</think>\n" + content
trajectory.append({
"from": "gpt",
"value": content.strip()
@ -1903,6 +1930,48 @@ class AIAgent:
if assistant_message.content and not self.quiet_mode:
print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
# Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
# This means the model ran out of output tokens mid-reasoning — retry up to 2 times
if self._has_incomplete_scratchpad(assistant_message.content or ""):
if not hasattr(self, '_incomplete_scratchpad_retries'):
self._incomplete_scratchpad_retries = 0
self._incomplete_scratchpad_retries += 1
print(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
if self._incomplete_scratchpad_retries <= 2:
print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
# Don't add the broken message, just retry
continue
else:
# Max retries - discard this turn and save as partial
print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
self._incomplete_scratchpad_retries = 0
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
try:
cleanup_vm(effective_task_id)
except Exception:
pass
try:
cleanup_browser(effective_task_id)
except Exception:
pass
return {
"final_response": None,
"messages": rolled_back_messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
}
# Reset incomplete scratchpad counter on clean response
if hasattr(self, '_incomplete_scratchpad_retries'):
self._incomplete_scratchpad_retries = 0
# Check for tool calls
if assistant_message.tool_calls:
if not self.quiet_mode:

View file

@ -198,10 +198,10 @@ DISTRIBUTIONS = {
"toolsets": {
"terminal": 97, # 97% - terminal almost always available
"file": 97, # 97% - file tools almost always available
"web": 15, # 15% - web search/scrape for documentation
"browser": 10, # 10% - browser occasionally for web interaction
"vision": 8, # 8% - vision analysis rarely
"image_gen": 3 # 3% - image generation very rarely
"web": 97, # 15% - web search/scrape for documentation
"browser": 75, # 10% - browser occasionally for web interaction
"vision": 50, # 8% - vision analysis rarely
"image_gen": 10 # 3% - image generation very rarely
}
},