mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops
- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema. - Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns. - Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics. - Updated documentation and comments for clarity on new features and changes.
This commit is contained in:
parent
f12ea1bc02
commit
dd70d57b9b
4 changed files with 277 additions and 90 deletions
125
batch_runner.py
125
batch_runner.py
|
|
@ -41,24 +41,17 @@ from toolset_distributions import (
|
|||
sample_toolsets_from_distribution,
|
||||
validate_distribution
|
||||
)
|
||||
from model_tools import TOOL_TO_TOOLSET_MAP
|
||||
|
||||
|
||||
# Global configuration for worker processes
|
||||
_WORKER_CONFIG = {}
|
||||
|
||||
# All possible tools - used to ensure consistent schema across all trajectory entries
|
||||
# This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas
|
||||
ALL_POSSIBLE_TOOLS = {
|
||||
'terminal', 'web_search', 'web_extract',
|
||||
'vision_analyze', 'image_generate', 'mixture_of_agents',
|
||||
# Skills tools
|
||||
'skills_categories', 'skills_list', 'skill_view',
|
||||
# Browser automation tools
|
||||
'browser_navigate', 'browser_snapshot', 'browser_click',
|
||||
'browser_type', 'browser_scroll', 'browser_back',
|
||||
'browser_press', 'browser_close', 'browser_get_images',
|
||||
'browser_vision'
|
||||
}
|
||||
# All possible tools - auto-derived from the master mapping in model_tools.py.
|
||||
# This stays in sync automatically when new tools are added to TOOL_TO_TOOLSET_MAP.
|
||||
# Used for consistent schema in Arrow/Parquet (HuggingFace datasets) and for
|
||||
# filtering corrupted entries during trajectory combination.
|
||||
ALL_POSSIBLE_TOOLS = set(TOOL_TO_TOOLSET_MAP.keys())
|
||||
|
||||
# Default stats for tools that weren't used
|
||||
DEFAULT_TOOL_STATS = {'count': 0, 'success': 0, 'failure': 0}
|
||||
|
|
@ -200,6 +193,42 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
|
|||
return tool_stats
|
||||
|
||||
|
||||
def _extract_reasoning_stats(messages: List[Dict[str, Any]]) -> Dict[str, int]:
|
||||
"""
|
||||
Count how many assistant turns have reasoning vs no reasoning.
|
||||
|
||||
Checks for <REASONING_SCRATCHPAD> in content or a non-empty 'reasoning' field
|
||||
(native thinking tokens). Returns counts for tracking reasoning coverage.
|
||||
|
||||
Args:
|
||||
messages: Message history
|
||||
|
||||
Returns:
|
||||
Dict with 'total_assistant_turns', 'turns_with_reasoning', 'turns_without_reasoning'
|
||||
"""
|
||||
total = 0
|
||||
with_reasoning = 0
|
||||
|
||||
for msg in messages:
|
||||
if msg.get("role") != "assistant":
|
||||
continue
|
||||
total += 1
|
||||
|
||||
content = msg.get("content", "") or ""
|
||||
has_scratchpad = "<REASONING_SCRATCHPAD>" in content
|
||||
has_native_reasoning = bool(msg.get("reasoning", "").strip()) if msg.get("reasoning") else False
|
||||
|
||||
if has_scratchpad or has_native_reasoning:
|
||||
with_reasoning += 1
|
||||
|
||||
return {
|
||||
"total_assistant_turns": total,
|
||||
"turns_with_reasoning": with_reasoning,
|
||||
"turns_without_reasoning": total - with_reasoning,
|
||||
"has_any_reasoning": with_reasoning > 0,
|
||||
}
|
||||
|
||||
|
||||
def _process_single_prompt(
|
||||
prompt_index: int,
|
||||
prompt_data: Dict[str, Any],
|
||||
|
|
@ -255,6 +284,9 @@ def _process_single_prompt(
|
|||
# Extract tool usage statistics
|
||||
tool_stats = _extract_tool_stats(result["messages"])
|
||||
|
||||
# Extract reasoning coverage stats
|
||||
reasoning_stats = _extract_reasoning_stats(result["messages"])
|
||||
|
||||
# Convert to trajectory format (using existing method)
|
||||
trajectory = agent._convert_to_trajectory_format(
|
||||
result["messages"],
|
||||
|
|
@ -267,6 +299,7 @@ def _process_single_prompt(
|
|||
"prompt_index": prompt_index,
|
||||
"trajectory": trajectory,
|
||||
"tool_stats": tool_stats,
|
||||
"reasoning_stats": reasoning_stats,
|
||||
"completed": result["completed"],
|
||||
"partial": result.get("partial", False),
|
||||
"api_calls": result["api_calls"],
|
||||
|
|
@ -335,7 +368,9 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||
|
||||
# Initialize aggregated stats for this batch
|
||||
batch_tool_stats = {}
|
||||
batch_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||
completed_in_batch = []
|
||||
discarded_no_reasoning = 0
|
||||
|
||||
# Process each prompt sequentially in this batch
|
||||
for prompt_index, prompt_data in prompts_to_process:
|
||||
|
|
@ -349,6 +384,13 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||
|
||||
# Save trajectory if successful
|
||||
if result["success"] and result["trajectory"]:
|
||||
# Discard samples with zero reasoning across all turns
|
||||
reasoning = result.get("reasoning_stats", {})
|
||||
if not reasoning.get("has_any_reasoning", True):
|
||||
print(f" 🚫 Prompt {prompt_index} discarded (no reasoning in any turn)")
|
||||
discarded_no_reasoning += 1
|
||||
continue
|
||||
|
||||
# Get and normalize tool stats for consistent schema across all entries
|
||||
raw_tool_stats = result.get("tool_stats", {})
|
||||
tool_stats = _normalize_tool_stats(raw_tool_stats)
|
||||
|
|
@ -389,6 +431,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||
batch_tool_stats[tool_name]["success"] += stats["success"]
|
||||
batch_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||
|
||||
# Aggregate reasoning stats
|
||||
for key in batch_reasoning_stats:
|
||||
batch_reasoning_stats[key] += result.get("reasoning_stats", {}).get(key, 0)
|
||||
|
||||
# Only mark as completed if successfully saved (failed prompts can be retried on resume)
|
||||
if result["success"] and result["trajectory"]:
|
||||
completed_in_batch.append(prompt_index)
|
||||
|
|
@ -404,6 +450,8 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
|
|||
"processed": len(prompts_to_process),
|
||||
"skipped": len(batch_data) - len(prompts_to_process),
|
||||
"tool_stats": batch_tool_stats,
|
||||
"reasoning_stats": batch_reasoning_stats,
|
||||
"discarded_no_reasoning": discarded_no_reasoning,
|
||||
"completed_prompts": completed_in_batch
|
||||
}
|
||||
|
||||
|
|
@ -434,6 +482,7 @@ class BatchRunner:
|
|||
max_tokens: int = None,
|
||||
reasoning_config: Dict[str, Any] = None,
|
||||
prefill_messages: List[Dict[str, Any]] = None,
|
||||
max_samples: int = None,
|
||||
):
|
||||
"""
|
||||
Initialize the batch runner.
|
||||
|
|
@ -458,6 +507,7 @@ class BatchRunner:
|
|||
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
|
||||
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
|
||||
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||
"""
|
||||
self.dataset_file = Path(dataset_file)
|
||||
self.batch_size = batch_size
|
||||
|
|
@ -478,6 +528,7 @@ class BatchRunner:
|
|||
self.max_tokens = max_tokens
|
||||
self.reasoning_config = reasoning_config
|
||||
self.prefill_messages = prefill_messages
|
||||
self.max_samples = max_samples
|
||||
|
||||
# Validate distribution
|
||||
if not validate_distribution(distribution):
|
||||
|
|
@ -493,8 +544,12 @@ class BatchRunner:
|
|||
# Statistics file
|
||||
self.stats_file = self.output_dir / "statistics.json"
|
||||
|
||||
# Load dataset
|
||||
# Load dataset (and optionally truncate to max_samples)
|
||||
self.dataset = self._load_dataset()
|
||||
if self.max_samples and self.max_samples < len(self.dataset):
|
||||
full_count = len(self.dataset)
|
||||
self.dataset = self.dataset[:self.max_samples]
|
||||
print(f"✂️ Truncated dataset from {full_count} to {self.max_samples} samples (--max_samples)")
|
||||
|
||||
# Create batches
|
||||
self.batches = self._create_batches()
|
||||
|
|
@ -812,6 +867,8 @@ class BatchRunner:
|
|||
|
||||
# Aggregate all batch statistics and update checkpoint
|
||||
all_completed_prompts = list(completed_prompts_set)
|
||||
total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||
|
||||
for batch_result in results:
|
||||
# Add newly completed prompts
|
||||
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
|
||||
|
|
@ -828,6 +885,10 @@ class BatchRunner:
|
|||
total_tool_stats[tool_name]["count"] += stats["count"]
|
||||
total_tool_stats[tool_name]["success"] += stats["success"]
|
||||
total_tool_stats[tool_name]["failure"] += stats["failure"]
|
||||
|
||||
# Aggregate reasoning stats
|
||||
for key in total_reasoning_stats:
|
||||
total_reasoning_stats[key] += batch_result.get("reasoning_stats", {}).get(key, 0)
|
||||
|
||||
# Save final checkpoint
|
||||
checkpoint_data["completed_prompts"] = all_completed_prompts
|
||||
|
|
@ -850,15 +911,8 @@ class BatchRunner:
|
|||
combined_file = self.output_dir / "trajectories.jsonl"
|
||||
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
|
||||
|
||||
VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze',
|
||||
'image_generate', 'mixture_of_agents',
|
||||
# Skills tools
|
||||
'skills_categories', 'skills_list', 'skill_view',
|
||||
# Browser automation tools
|
||||
'browser_navigate', 'browser_snapshot', 'browser_click',
|
||||
'browser_type', 'browser_scroll', 'browser_back',
|
||||
'browser_press', 'browser_close', 'browser_get_images',
|
||||
'browser_vision'}
|
||||
# Valid tools auto-derived from model_tools.py — no manual updates needed
|
||||
VALID_TOOLS = ALL_POSSIBLE_TOOLS
|
||||
|
||||
total_entries = 0
|
||||
filtered_entries = 0
|
||||
|
|
@ -907,7 +961,8 @@ class BatchRunner:
|
|||
"model": self.model,
|
||||
"completed_at": datetime.now().isoformat(),
|
||||
"duration_seconds": round(time.time() - start_time, 2),
|
||||
"tool_statistics": total_tool_stats
|
||||
"tool_statistics": total_tool_stats,
|
||||
"reasoning_statistics": total_reasoning_stats,
|
||||
}
|
||||
|
||||
with open(self.stats_file, 'w', encoding='utf-8') as f:
|
||||
|
|
@ -945,6 +1000,25 @@ class BatchRunner:
|
|||
else:
|
||||
print("No tool calls were made during this run.")
|
||||
|
||||
# Print reasoning coverage stats
|
||||
total_discarded = sum(r.get("discarded_no_reasoning", 0) for r in results)
|
||||
|
||||
print(f"\n🧠 Reasoning Coverage:")
|
||||
print("-" * 70)
|
||||
total_turns = total_reasoning_stats["total_assistant_turns"]
|
||||
with_reasoning = total_reasoning_stats["turns_with_reasoning"]
|
||||
without_reasoning = total_reasoning_stats["turns_without_reasoning"]
|
||||
if total_turns > 0:
|
||||
pct_with = round(with_reasoning / total_turns * 100, 1)
|
||||
pct_without = round(without_reasoning / total_turns * 100, 1)
|
||||
print(f" Total assistant turns: {total_turns:,}")
|
||||
print(f" With reasoning: {with_reasoning:,} ({pct_with}%)")
|
||||
print(f" Without reasoning: {without_reasoning:,} ({pct_without}%)")
|
||||
else:
|
||||
print(" No assistant turns recorded.")
|
||||
if total_discarded > 0:
|
||||
print(f" 🚫 Samples discarded (zero reasoning): {total_discarded:,}")
|
||||
|
||||
print(f"\n💾 Results saved to: {self.output_dir}")
|
||||
print(f" - Trajectories: trajectories.jsonl (combined)")
|
||||
print(f" - Individual batches: batch_*.jsonl (for debugging)")
|
||||
|
|
@ -975,6 +1049,7 @@ def main(
|
|||
reasoning_effort: str = None,
|
||||
reasoning_disabled: bool = False,
|
||||
prefill_messages_file: str = None,
|
||||
max_samples: int = None,
|
||||
):
|
||||
"""
|
||||
Run batch processing of agent prompts from a dataset.
|
||||
|
|
@ -1002,6 +1077,7 @@ def main(
|
|||
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh")
|
||||
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
|
||||
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
|
||||
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||
|
||||
Examples:
|
||||
# Basic usage
|
||||
|
|
@ -1110,6 +1186,7 @@ def main(
|
|||
max_tokens=max_tokens,
|
||||
reasoning_config=reasoning_config,
|
||||
prefill_messages=prefill_messages,
|
||||
max_samples=max_samples,
|
||||
)
|
||||
|
||||
runner.run(resume=resume)
|
||||
|
|
|
|||
165
model_tools.py
165
model_tools.py
|
|
@ -700,13 +700,21 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_file",
|
||||
"description": "Read a file with pagination support. Returns content with line numbers in 'LINE_NUM|CONTENT' format. For binary files (images), returns base64-encoded data. If file not found, suggests similar filenames.",
|
||||
"description": (
|
||||
"Read a file with pagination support. Preferred over 'cat' in the terminal because it "
|
||||
"provides line numbers, handles binary/image files, and suggests similar filenames if "
|
||||
"the file is not found.\n\n"
|
||||
"**Output format:** Each line is returned as 'LINE_NUM|CONTENT' for easy reference.\n"
|
||||
"**Binary files:** Detected automatically; images (png/jpg/gif/webp) are returned as base64 with MIME type and dimensions.\n"
|
||||
"**Large files:** Use offset and limit to paginate. The response includes total line count and a hint for the next page.\n"
|
||||
"**Paths:** Supports absolute paths, relative paths (from working directory), and ~ expansion."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the file to read (absolute or relative)"
|
||||
"description": "Path to the file to read (absolute, relative, or ~/path)"
|
||||
},
|
||||
"offset": {
|
||||
"type": "integer",
|
||||
|
|
@ -729,17 +737,25 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "write_file",
|
||||
"description": "Write content to a file. Creates parent directories automatically. Returns bytes written and lint check results for supported languages.",
|
||||
"description": (
|
||||
"Write content to a file, completely replacing any existing content. Creates parent "
|
||||
"directories automatically if they don't exist. Preferred over 'echo' or heredoc in the "
|
||||
"terminal because it safely handles special characters, newlines, and shell metacharacters "
|
||||
"without escaping issues.\n\n"
|
||||
"**Important:** This OVERWRITES the entire file. To make targeted edits to an existing file, "
|
||||
"use the 'patch' tool instead.\n"
|
||||
"**Paths:** Supports absolute paths, relative paths, and ~ expansion."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the file to write (will be created if doesn't exist)"
|
||||
"description": "Path to the file to write (will be created if it doesn't exist, overwritten if it does)"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "Content to write to the file"
|
||||
"description": "Complete content to write to the file"
|
||||
}
|
||||
},
|
||||
"required": ["path", "content"]
|
||||
|
|
@ -750,36 +766,48 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "patch",
|
||||
"description": "Modify files using either simple string replacement or V4A patch format. Mode 'replace' does find-and-replace with fuzzy matching. Mode 'patch' applies multi-file changes using V4A format (*** Begin/End Patch). Auto-runs syntax checks on modified files.",
|
||||
"description": (
|
||||
"Modify existing files using targeted edits. Preferred over 'sed' or manual rewriting because "
|
||||
"it uses intelligent fuzzy matching that tolerates minor whitespace and indentation differences, "
|
||||
"and auto-runs syntax checks (Python, JS, TS, Go, Rust) after editing.\n\n"
|
||||
"**Replace mode (recommended):** Find a unique string in the file and replace it. Uses a "
|
||||
"9-strategy fuzzy matching chain (exact → line-trimmed → whitespace-normalized → "
|
||||
"indentation-flexible → context-aware) so small formatting differences won't cause failures. "
|
||||
"Returns a unified diff showing exactly what changed.\n\n"
|
||||
"**Patch mode:** Apply multi-file changes using V4A patch format for large-scale edits across "
|
||||
"multiple files in one call.\n\n"
|
||||
"**Auto-lint:** After every edit, automatically runs syntax checks and reports errors so you "
|
||||
"can fix them immediately."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["replace", "patch"],
|
||||
"description": "Edit mode: 'replace' for string replacement, 'patch' for V4A patch format",
|
||||
"description": "Edit mode: 'replace' for targeted find-and-replace, 'patch' for V4A multi-file patches",
|
||||
"default": "replace"
|
||||
},
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "File path (required for 'replace' mode)"
|
||||
"description": "File path to edit (required for 'replace' mode)"
|
||||
},
|
||||
"old_string": {
|
||||
"type": "string",
|
||||
"description": "Text to find and replace (required for 'replace' mode). Must be unique in file unless replace_all=true"
|
||||
"description": "Text to find in the file (required for 'replace' mode). Must be unique in the file unless replace_all=true. Include enough surrounding context to ensure uniqueness."
|
||||
},
|
||||
"new_string": {
|
||||
"type": "string",
|
||||
"description": "Replacement text (required for 'replace' mode)"
|
||||
"description": "Replacement text (required for 'replace' mode). Can be empty string to delete the matched text."
|
||||
},
|
||||
"replace_all": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all occurrences instead of requiring unique match (default: false)",
|
||||
"description": "Replace all occurrences instead of requiring a unique match (default: false)",
|
||||
"default": False
|
||||
},
|
||||
"patch": {
|
||||
"type": "string",
|
||||
"description": "V4A format patch content (required for 'patch' mode). Format: *** Begin Patch / *** Update File: path / @@ context @@ / -removed / +added / *** End Patch"
|
||||
"description": "V4A format patch content (required for 'patch' mode). Format:\n*** Begin Patch\n*** Update File: path/to/file\n@@ context hint @@\n context line\n-removed line\n+added line\n*** End Patch"
|
||||
}
|
||||
},
|
||||
"required": ["mode"]
|
||||
|
|
@ -790,7 +818,16 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "search",
|
||||
"description": "Search for content in files or search for files by name. Use target='content' to search inside files (like grep), or target='files' to find files by name pattern (like glob/find). Results sorted by modification time (newest first).",
|
||||
"description": (
|
||||
"Search for content inside files or find files by name. Preferred over 'grep' or 'find' "
|
||||
"in the terminal because it uses ripgrep (fast) with automatic fallback to grep, handles "
|
||||
"pagination, and returns structured results sorted by modification time (newest first).\n\n"
|
||||
"**Content search (target='content'):** Regex-powered search inside files with optional "
|
||||
"file type filtering and context lines. Three output modes: full matches with line numbers, "
|
||||
"file paths only, or match counts per file.\n\n"
|
||||
"**File search (target='files'):** Find files by glob pattern (e.g., '*.py', '*config*'). "
|
||||
"Results sorted by modification time so recently changed files appear first."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -801,12 +838,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"target": {
|
||||
"type": "string",
|
||||
"enum": ["content", "files"],
|
||||
"description": "Search mode: 'content' searches inside files, 'files' searches for files by name",
|
||||
"description": "Search mode: 'content' searches inside files (like grep/rg), 'files' searches for files by name (like find/glob)",
|
||||
"default": "content"
|
||||
},
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Directory or file to search in (default: current directory)",
|
||||
"description": "Directory or file to search in (default: current working directory)",
|
||||
"default": "."
|
||||
},
|
||||
"file_glob": {
|
||||
|
|
@ -815,7 +852,7 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results (default: 50)",
|
||||
"description": "Maximum number of results to return (default: 50)",
|
||||
"default": 50
|
||||
},
|
||||
"offset": {
|
||||
|
|
@ -826,12 +863,12 @@ def get_file_tool_definitions() -> List[Dict[str, Any]]:
|
|||
"output_mode": {
|
||||
"type": "string",
|
||||
"enum": ["content", "files_only", "count"],
|
||||
"description": "For target='content': 'content' shows matches, 'files_only' shows file paths, 'count' shows match counts per file",
|
||||
"description": "Output format for content search: 'content' shows matching lines with line numbers, 'files_only' lists file paths, 'count' shows match counts per file",
|
||||
"default": "content"
|
||||
},
|
||||
"context": {
|
||||
"type": "integer",
|
||||
"description": "Lines of context around matches (only for target='content', output_mode='content')",
|
||||
"description": "Number of lines to show before and after each match (only for target='content', output_mode='content')",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
|
|
@ -909,6 +946,53 @@ def get_all_tool_names() -> List[str]:
|
|||
return tool_names
|
||||
|
||||
|
||||
# Master mapping of every tool name → its toolset.
|
||||
# This is the single source of truth for all valid tool names in the system.
|
||||
# Import TOOL_TO_TOOLSET_MAP from here whenever you need to check valid tools.
|
||||
TOOL_TO_TOOLSET_MAP = {
|
||||
"web_search": "web_tools",
|
||||
"web_extract": "web_tools",
|
||||
"terminal": "terminal_tools",
|
||||
"vision_analyze": "vision_tools",
|
||||
"mixture_of_agents": "moa_tools",
|
||||
"image_generate": "image_tools",
|
||||
# Skills tools
|
||||
"skills_categories": "skills_tools",
|
||||
"skills_list": "skills_tools",
|
||||
"skill_view": "skills_tools",
|
||||
# Browser automation tools
|
||||
"browser_navigate": "browser_tools",
|
||||
"browser_snapshot": "browser_tools",
|
||||
"browser_click": "browser_tools",
|
||||
"browser_type": "browser_tools",
|
||||
"browser_scroll": "browser_tools",
|
||||
"browser_back": "browser_tools",
|
||||
"browser_press": "browser_tools",
|
||||
"browser_close": "browser_tools",
|
||||
"browser_get_images": "browser_tools",
|
||||
"browser_vision": "browser_tools",
|
||||
# Cronjob management tools
|
||||
"schedule_cronjob": "cronjob_tools",
|
||||
"list_cronjobs": "cronjob_tools",
|
||||
"remove_cronjob": "cronjob_tools",
|
||||
# RL Training tools
|
||||
"rl_list_environments": "rl_tools",
|
||||
"rl_select_environment": "rl_tools",
|
||||
"rl_get_current_config": "rl_tools",
|
||||
"rl_edit_config": "rl_tools",
|
||||
"rl_start_training": "rl_tools",
|
||||
"rl_check_status": "rl_tools",
|
||||
"rl_stop_training": "rl_tools",
|
||||
"rl_get_results": "rl_tools",
|
||||
"rl_list_runs": "rl_tools",
|
||||
# File manipulation tools
|
||||
"read_file": "file_tools",
|
||||
"write_file": "file_tools",
|
||||
"patch": "file_tools",
|
||||
"search": "file_tools",
|
||||
}
|
||||
|
||||
|
||||
def get_toolset_for_tool(tool_name: str) -> str:
|
||||
"""
|
||||
Get the toolset that a tool belongs to.
|
||||
|
|
@ -919,50 +1003,7 @@ def get_toolset_for_tool(tool_name: str) -> str:
|
|||
Returns:
|
||||
str: Name of the toolset, or "unknown" if not found
|
||||
"""
|
||||
toolset_mapping = {
|
||||
"web_search": "web_tools",
|
||||
"web_extract": "web_tools",
|
||||
"terminal": "terminal_tools",
|
||||
"vision_analyze": "vision_tools",
|
||||
"mixture_of_agents": "moa_tools",
|
||||
"image_generate": "image_tools",
|
||||
# Skills tools
|
||||
"skills_categories": "skills_tools",
|
||||
"skills_list": "skills_tools",
|
||||
"skill_view": "skills_tools",
|
||||
# Browser automation tools
|
||||
"browser_navigate": "browser_tools",
|
||||
"browser_snapshot": "browser_tools",
|
||||
"browser_click": "browser_tools",
|
||||
"browser_type": "browser_tools",
|
||||
"browser_scroll": "browser_tools",
|
||||
"browser_back": "browser_tools",
|
||||
"browser_press": "browser_tools",
|
||||
"browser_close": "browser_tools",
|
||||
"browser_get_images": "browser_tools",
|
||||
"browser_vision": "browser_tools",
|
||||
# Cronjob management tools
|
||||
"schedule_cronjob": "cronjob_tools",
|
||||
"list_cronjobs": "cronjob_tools",
|
||||
"remove_cronjob": "cronjob_tools",
|
||||
# RL Training tools
|
||||
"rl_list_environments": "rl_tools",
|
||||
"rl_select_environment": "rl_tools",
|
||||
"rl_get_current_config": "rl_tools",
|
||||
"rl_edit_config": "rl_tools",
|
||||
"rl_start_training": "rl_tools",
|
||||
"rl_check_status": "rl_tools",
|
||||
"rl_stop_training": "rl_tools",
|
||||
"rl_get_results": "rl_tools",
|
||||
"rl_list_runs": "rl_tools",
|
||||
# File manipulation tools
|
||||
"read_file": "file_tools",
|
||||
"write_file": "file_tools",
|
||||
"patch": "file_tools",
|
||||
"search": "file_tools",
|
||||
}
|
||||
|
||||
return toolset_mapping.get(tool_name, "unknown")
|
||||
return TOOL_TO_TOOLSET_MAP.get(tool_name, "unknown")
|
||||
|
||||
|
||||
def get_tool_definitions(
|
||||
|
|
|
|||
69
run_agent.py
69
run_agent.py
|
|
@ -1120,6 +1120,24 @@ class AIAgent:
|
|||
return content
|
||||
return content.replace("<REASONING_SCRATCHPAD>", "<think>").replace("</REASONING_SCRATCHPAD>", "</think>")
|
||||
|
||||
@staticmethod
|
||||
def _has_incomplete_scratchpad(content: str) -> bool:
|
||||
"""
|
||||
Check if content has an opening <REASONING_SCRATCHPAD> without a closing tag.
|
||||
|
||||
This indicates the model ran out of output tokens mid-reasoning, producing
|
||||
a broken turn that shouldn't be saved. The caller should retry or discard.
|
||||
|
||||
Args:
|
||||
content: Assistant message content to check
|
||||
|
||||
Returns:
|
||||
True if there's an unclosed scratchpad tag
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
return "<REASONING_SCRATCHPAD>" in content and "</REASONING_SCRATCHPAD>" not in content
|
||||
|
||||
def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Convert internal message format to trajectory format for saving.
|
||||
|
|
@ -1204,6 +1222,11 @@ class AIAgent:
|
|||
}
|
||||
content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
|
||||
|
||||
# Ensure every gpt turn has a <think> block (empty if no reasoning)
|
||||
# so the format is consistent for training data
|
||||
if "<think>" not in content:
|
||||
content = "<think>\n</think>\n" + content
|
||||
|
||||
trajectory.append({
|
||||
"from": "gpt",
|
||||
"value": content.rstrip()
|
||||
|
|
@ -1256,6 +1279,10 @@ class AIAgent:
|
|||
raw_content = msg["content"] or ""
|
||||
content += self._convert_scratchpad_to_think(raw_content)
|
||||
|
||||
# Ensure every gpt turn has a <think> block (empty if no reasoning)
|
||||
if "<think>" not in content:
|
||||
content = "<think>\n</think>\n" + content
|
||||
|
||||
trajectory.append({
|
||||
"from": "gpt",
|
||||
"value": content.strip()
|
||||
|
|
@ -1903,6 +1930,48 @@ class AIAgent:
|
|||
if assistant_message.content and not self.quiet_mode:
|
||||
print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
|
||||
|
||||
# Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
|
||||
# This means the model ran out of output tokens mid-reasoning — retry up to 2 times
|
||||
if self._has_incomplete_scratchpad(assistant_message.content or ""):
|
||||
if not hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
self._incomplete_scratchpad_retries += 1
|
||||
|
||||
print(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
|
||||
|
||||
if self._incomplete_scratchpad_retries <= 2:
|
||||
print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
|
||||
# Don't add the broken message, just retry
|
||||
continue
|
||||
else:
|
||||
# Max retries - discard this turn and save as partial
|
||||
print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
|
||||
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
|
||||
|
||||
try:
|
||||
cleanup_vm(effective_task_id)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
cleanup_browser(effective_task_id)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"final_response": None,
|
||||
"messages": rolled_back_messages,
|
||||
"api_calls": api_call_count,
|
||||
"completed": False,
|
||||
"partial": True,
|
||||
"error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
|
||||
}
|
||||
|
||||
# Reset incomplete scratchpad counter on clean response
|
||||
if hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
|
||||
# Check for tool calls
|
||||
if assistant_message.tool_calls:
|
||||
if not self.quiet_mode:
|
||||
|
|
|
|||
|
|
@ -198,10 +198,10 @@ DISTRIBUTIONS = {
|
|||
"toolsets": {
|
||||
"terminal": 97, # 97% - terminal almost always available
|
||||
"file": 97, # 97% - file tools almost always available
|
||||
"web": 15, # 15% - web search/scrape for documentation
|
||||
"browser": 10, # 10% - browser occasionally for web interaction
|
||||
"vision": 8, # 8% - vision analysis rarely
|
||||
"image_gen": 3 # 3% - image generation very rarely
|
||||
"web": 97, # 15% - web search/scrape for documentation
|
||||
"browser": 75, # 10% - browser occasionally for web interaction
|
||||
"vision": 50, # 8% - vision analysis rarely
|
||||
"image_gen": 10 # 3% - image generation very rarely
|
||||
}
|
||||
},
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue