Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.
2026-05-02 02:01:47 +00:00 · 2026-01-10 05:56:26 +00:00 · 2026-01-10 05:56:26 +00:00 · 4071ba29da
commit 4071ba29da
parent 21f9e2df40
8 changed files with 572 additions and 111 deletions
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -192,6 +192,7 @@ def _process_single_prompt(
            "trajectory": trajectory,
            "tool_stats": tool_stats,
            "completed": result["completed"],
+            "partial": result.get("partial", False),
            "api_calls": result["api_calls"],
            "toolsets_used": selected_toolsets,
            "metadata": {
@@ -272,13 +273,23 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
        
        # Save trajectory if successful
        if result["success"] and result["trajectory"]:
+            # Create tool_error_counts mapping tool names to their failure counts
+            tool_stats = result.get("tool_stats", {})
+            tool_error_counts = {
+                tool_name: stats.get("failure", 0) 
+                for tool_name, stats in tool_stats.items()
+            }
+            
            trajectory_entry = {
                "prompt_index": prompt_index,
                "conversations": result["trajectory"],
                "metadata": result["metadata"],
                "completed": result["completed"],
+                "partial": result.get("partial", False),  # True if stopped due to invalid tool calls
                "api_calls": result["api_calls"],
-                "toolsets_used": result["toolsets_used"]
+                "toolsets_used": result["toolsets_used"],
+                "tool_stats": tool_stats,  # Full stats: {tool: {count, success, failure}}
+                "tool_error_counts": tool_error_counts  # Simple: {tool: failure_count}
            }
            
            # Append to batch output file
@@ -601,18 +612,44 @@ class BatchRunner:
                stats["failure_rate"] = 0.0
        
        # Combine all batch files into a single trajectories.jsonl file
+        # Also filter out corrupted entries (where model generated invalid tool names)
        combined_file = self.output_dir / "trajectories.jsonl"
        print(f"\n📦 Combining batch files into {combined_file.name}...")
        
+        VALID_TOOLS = {'web_search', 'web_extract', 'web_crawl', 'terminal', 'vision_analyze', 
+                       'image_generate', 'mixture_of_agents'}
+        
+        total_entries = 0
+        filtered_entries = 0
+        
        with open(combined_file, 'w', encoding='utf-8') as outfile:
            for batch_num in range(len(self.batches)):
                batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
                if batch_file.exists():
                    with open(batch_file, 'r', encoding='utf-8') as infile:
                        for line in infile:
-                            outfile.write(line)
+                            total_entries += 1
+                            try:
+                                data = json.loads(line)
+                                tool_stats = data.get('tool_stats', {})
+                                
+                                # Check for invalid tool names (model hallucinations)
+                                invalid_tools = [k for k in tool_stats.keys() if k not in VALID_TOOLS]
+                                
+                                if invalid_tools:
+                                    filtered_entries += 1
+                                    invalid_preview = invalid_tools[0][:50] + "..." if len(invalid_tools[0]) > 50 else invalid_tools[0]
+                                    print(f"   ⚠️  Filtering corrupted entry (batch {batch_num}): invalid tool '{invalid_preview}'")
+                                    continue
+                                
+                                outfile.write(line)
+                            except json.JSONDecodeError:
+                                filtered_entries += 1
+                                print(f"   ⚠️  Filtering invalid JSON entry (batch {batch_num})")
        
-        print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl")
+        if filtered_entries > 0:
+            print(f"⚠️  Filtered {filtered_entries} corrupted entries out of {total_entries} total")
+        print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl ({total_entries - filtered_entries} entries)")
        
        # Save final statistics
        final_stats = {