diff --git a/gemini_nothinking.sh b/gemini_nothinking.sh
new file mode 100644
index 000000000..bc45f23eb
--- /dev/null
+++ b/gemini_nothinking.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Launch batch_runner.py on source-data/agent_tasks_eval.jsonl with the
+# gemini-3-pro-preview model, via the Google Generative Language API's
+# /v1beta/openai/ endpoint.
+#
+# Requires GEMINI_API_KEY to be set in the environment.
+set -euo pipefail
+
+# Fail fast with a clear message when the key is unset or empty; without
+# this check an empty --api_key value would be handed to batch_runner.py.
+: "${GEMINI_API_KEY:?GEMINI_API_KEY must be set}"
+
+# NOTE(review): --run_name says "gemini-4.5-3" but --model is
+# "gemini-3-pro-preview"; confirm the run name is intentional.
+python batch_runner.py \
+  --dataset_file="source-data/agent_tasks_eval.jsonl" \
+  --batch_size=1 \
+  --run_name="agenttasks_eval_gemini-4.5-3-nothinking" \
+  --distribution="science" \
+  --model="gemini-3-pro-preview" \
+  --base_url="https://generativelanguage.googleapis.com/v1beta/openai/" \
+  --api_key="${GEMINI_API_KEY}" \
+  --num_workers=10 \
+  --max_turns=60 \
+  --verbose \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. If you require API keys please check which ones already exist in your environment variables in a way that does not read them."
diff --git a/run_agent.py b/run_agent.py index 526f30e2c..12977023d 100644 --- a/run_agent.py +++ b/run_agent.py @@ -401,6 +401,8 @@ class AIAgent: if self.verbose_logging: logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}") logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") + # Log the last message in full to see if thought_signature is present + logging.debug(f"Last message content: {json.dumps(messages[-1] if messages else {}, indent=2)}") api_start_time = time.time() retry_count = 0 @@ -459,22 +461,58 @@ class AIAgent: if self.verbose_logging: for tc in assistant_message.tool_calls: logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") + # Debug: Check what attributes are available on tool_call + logging.debug(f"Tool call attributes: {dir(tc)}") + # Try to dump the model to see all fields + if hasattr(tc, 'model_dump'): + logging.debug(f"Tool call data: {tc.model_dump()}") # Add assistant message with tool calls to conversation + # Extract thought_signature if present (required for Gemini models) + tool_calls_data = [] + for tool_call in assistant_message.tool_calls: + tool_call_dict = { + "id": tool_call.id, + "type": tool_call.type, + "function": { + "name": tool_call.function.name, + "arguments": tool_call.function.arguments + } + } + # Try multiple ways to access thought_signature (Gemini-specific) + # Gemini uses extra_content.google.thought_signature structure + thought_sig = None + + # Method 1: Check extra_content attribute (NOTE(review): the .get() below raises AttributeError if extra['google'] is not a dict, e.g. None; confirm this cannot happen) + if hasattr(tool_call, 'extra_content'): + extra = tool_call.extra_content + if isinstance(extra, dict) and 'google' in extra: + thought_sig = extra['google'].get('thought_signature') + + # Method 2: Check model_dump() if available (Pydantic v2) (NOTE(review): a None value under 'google' would make google_data.get() raise; confirm this cannot happen) + if thought_sig is None and hasattr(tool_call, 'model_dump'): + dumped = tool_call.model_dump() + if 'extra_content' in dumped and 
isinstance(dumped['extra_content'], dict): + google_data = dumped['extra_content'].get('google', {}) + thought_sig = google_data.get('thought_signature') + + if thought_sig is not None: + tool_call_dict["extra_content"] = { + "google": { + "thought_signature": thought_sig + } + } + if self.verbose_logging: + logging.debug(f"Captured thought_signature for tool call {tool_call.id}") + elif self.verbose_logging: + logging.debug(f"No thought_signature found for tool call {tool_call.id}") + + tool_calls_data.append(tool_call_dict) + messages.append({ "role": "assistant", "content": assistant_message.content, - "tool_calls": [ - { - "id": tool_call.id, - "type": tool_call.type, - "function": { - "name": tool_call.function.name, - "arguments": tool_call.function.arguments - } - } - for tool_call in assistant_message.tool_calls - ] + "tool_calls": tool_calls_data }) # Execute each tool call @@ -508,6 +546,7 @@ class AIAgent: logging.debug(f"Tool result preview: {result_preview}...") # Add tool result to conversation + # Note: thought_signature should NOT be in tool responses, only in assistant messages messages.append({ "role": "tool", "content": function_result,