diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index 6bd36387835..0db33e1cb36 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -274,6 +274,10 @@ TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm") # where GPT models abandon work on partial results, skip prerequisite lookups, # hallucinate instead of using tools, and declare "done" without verification. # Inspired by patterns from OpenAI's GPT-5.4 prompting guide & OpenClaw PR #38953. +# Also applied to xAI Grok — same failure modes in practice (claims completion +# without tool calls, suggests workarounds instead of using existing tools, +# replies with plans/suggestions instead of executing). The body is +# family-agnostic; the OPENAI_ prefix reflects origin, not exclusivity. OPENAI_MODEL_EXECUTION_GUIDANCE = ( "# Execution discipline\n" "\n" diff --git a/agent/system_prompt.py b/agent/system_prompt.py index 52a574101f5..d69d8c32099 100644 --- a/agent/system_prompt.py +++ b/agent/system_prompt.py @@ -156,7 +156,10 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None) stable_parts.append(GOOGLE_MODEL_OPERATIONAL_GUIDANCE) # OpenAI GPT/Codex execution discipline (tool persistence, # prerequisite checks, verification, anti-hallucination). - if "gpt" in _model_lower or "codex" in _model_lower: + # Also applied to xAI Grok — same failure modes (claims completion + # without tool calls, suggests workarounds instead of using + # existing tools, replies with plans instead of executing). 
+ if "gpt" in _model_lower or "codex" in _model_lower or "grok" in _model_lower: stable_parts.append(OPENAI_MODEL_EXECUTION_GUIDANCE) has_skills_tools = any(name in agent.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage']) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index bc8a044e3ad..55cc8186205 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -1074,6 +1074,40 @@ class TestToolUseEnforcementConfig: prompt = agent._build_system_prompt() assert TOOL_USE_ENFORCEMENT_GUIDANCE not in prompt + def test_auto_injects_for_grok(self): + """A provider-prefixed Grok slug (x-ai/grok-4.3) gets TOOL_USE_ENFORCEMENT_GUIDANCE in auto mode.""" + from agent.prompt_builder import TOOL_USE_ENFORCEMENT_GUIDANCE + agent = self._make_agent(model="x-ai/grok-4.3", tool_use_enforcement="auto") + prompt = agent._build_system_prompt() + assert TOOL_USE_ENFORCEMENT_GUIDANCE in prompt + + def test_auto_injects_execution_guidance_for_grok(self): + """Grok also gets OPENAI_MODEL_EXECUTION_GUIDANCE (verification, + mandatory_tool_use, act_dont_ask). Same failure modes as GPT in + practice — claims completion without tool calls, suggests workarounds + instead of using existing tools. 
+ """ + from agent.prompt_builder import OPENAI_MODEL_EXECUTION_GUIDANCE + agent = self._make_agent(model="x-ai/grok-4.3", tool_use_enforcement="auto") + prompt = agent._build_system_prompt() + assert OPENAI_MODEL_EXECUTION_GUIDANCE in prompt + + def test_auto_injects_execution_guidance_for_xai_oauth_model(self): + """xai-oauth bare model names (no slash) also match the grok pattern.""" + from agent.prompt_builder import OPENAI_MODEL_EXECUTION_GUIDANCE + agent = self._make_agent(model="grok-4.3", tool_use_enforcement="auto") + prompt = agent._build_system_prompt() + assert OPENAI_MODEL_EXECUTION_GUIDANCE in prompt + + def test_auto_does_not_inject_execution_guidance_for_claude(self): + """Sanity: execution guidance stays off for non-targeted families.""" + from agent.prompt_builder import OPENAI_MODEL_EXECUTION_GUIDANCE + agent = self._make_agent( + model="anthropic/claude-sonnet-4", tool_use_enforcement="auto" + ) + prompt = agent._build_system_prompt() + assert OPENAI_MODEL_EXECUTION_GUIDANCE not in prompt + def test_true_forces_for_all_models(self): from agent.prompt_builder import TOOL_USE_ENFORCEMENT_GUIDANCE agent = self._make_agent(model="anthropic/claude-sonnet-4", tool_use_enforcement=True)