fix(agent): add qwen and deepseek to TOOL_USE_ENFORCEMENT_MODELS

Qwen3.x and DeepSeek-V3.x default to chatty/hallucinatory tool use without
enforcement steering — agents narrate "calling tool X" without actually
emitting a tool call, or run partial loops. Both model families show the
same failure pattern that TOOL_USE_ENFORCEMENT_GUIDANCE is already injected
to counter in the existing families (gpt, codex, gemini, gemma, grok, glm).

Co-authored-by: briandevans <252620095+briandevans@users.noreply.github.com>

Squashed salvage of:
- 403e567ce fix(agent): add qwen and deepseek to TOOL_USE_ENFORCEMENT_MODELS
- 9433eabe7 test(agent): use realistic qwen-plus identifier in enforcement test

Fixes #28079.
This commit is contained in:
briandevans 2026-05-18 20:06:43 -07:00 committed by Teknium
parent 4229facc01
commit 756900723a
3 changed files with 21 additions and 1 deletions

View file

@ -268,7 +268,7 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = (
# Model name substrings that trigger tool-use enforcement guidance.
# Add new patterns here when a model family needs explicit steering.
# Previous value (the line this commit removes):
TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm")
# New value (the line this commit adds): same entries plus "qwen" and "deepseek".
TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm", "qwen", "deepseek")
# OpenAI GPT/Codex-specific execution guidance. Addresses known failure modes
# where GPT models abandon work on partial results, skip prerequisite lookups,

View file

@ -1144,6 +1144,12 @@ class TestToolUseEnforcementGuidance:
def test_enforcement_models_includes_grok(self):
    """The "grok" substring must be listed in TOOL_USE_ENFORCEMENT_MODELS."""
    registered_patterns = TOOL_USE_ENFORCEMENT_MODELS
    assert "grok" in registered_patterns
def test_enforcement_models_includes_qwen(self):
    """The "qwen" substring must be listed in TOOL_USE_ENFORCEMENT_MODELS."""
    registered_patterns = TOOL_USE_ENFORCEMENT_MODELS
    assert "qwen" in registered_patterns
def test_enforcement_models_includes_deepseek(self):
    """The "deepseek" substring must be listed in TOOL_USE_ENFORCEMENT_MODELS."""
    registered_patterns = TOOL_USE_ENFORCEMENT_MODELS
    assert "deepseek" in registered_patterns
def test_enforcement_models_is_tuple(self):
    """TOOL_USE_ENFORCEMENT_MODELS must be a tuple instance."""
    patterns = TOOL_USE_ENFORCEMENT_MODELS
    is_tuple = isinstance(patterns, tuple)
    assert is_tuple

View file

@ -1103,6 +1103,20 @@ class TestToolUseEnforcementConfig:
prompt = agent._build_system_prompt()
assert TOOL_USE_ENFORCEMENT_GUIDANCE in prompt
def test_auto_injects_for_qwen(self):
    """With enforcement set to "auto", a Qwen model id gets the guidance.

    Builds an agent for "qwen/qwen-plus" and checks that the generated
    system prompt contains TOOL_USE_ENFORCEMENT_GUIDANCE.
    """
    from agent.prompt_builder import TOOL_USE_ENFORCEMENT_GUIDANCE

    qwen_agent = self._make_agent(
        model="qwen/qwen-plus",
        tool_use_enforcement="auto",
    )
    system_prompt = qwen_agent._build_system_prompt()
    assert TOOL_USE_ENFORCEMENT_GUIDANCE in system_prompt
def test_auto_injects_for_deepseek(self):
    """With enforcement set to "auto", a DeepSeek model id gets the guidance.

    Builds an agent for "deepseek/deepseek-r1" and checks that the generated
    system prompt contains TOOL_USE_ENFORCEMENT_GUIDANCE.
    """
    from agent.prompt_builder import TOOL_USE_ENFORCEMENT_GUIDANCE

    deepseek_agent = self._make_agent(
        model="deepseek/deepseek-r1",
        tool_use_enforcement="auto",
    )
    system_prompt = deepseek_agent._build_system_prompt()
    assert TOOL_USE_ENFORCEMENT_GUIDANCE in system_prompt
def test_auto_injects_execution_guidance_for_grok(self):
"""Grok also gets OPENAI_MODEL_EXECUTION_GUIDANCE (verification,
mandatory_tool_use, act_dont_ask). Same failure modes as GPT in