From 242962e1f5a0d2a29db7683c01de907369eb2145 Mon Sep 17 00:00:00 2001
From: HwangJohn <angelic805@gmail.com>
Date: Wed, 17 Jun 2026 18:34:40 +0900
Subject: [PATCH] docs(providers): clarify vllm qwen reasoning output

Signed-off-by: HwangJohn <angelic805@gmail.com>

Co-authored-by: OpenAI Codex <codex@openai.com>
---
 cli-config.yaml.example                |  4 ++++
 website/docs/integrations/providers.md | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/cli-config.yaml.example b/cli-config.yaml.example
index 942b3252e21..b6eb191b2a7 100644
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -483,6 +483,10 @@ prompt_caching:
 #                           # reasoning controls:
 #                           # extra_body:
 #                           #   enable_thinking: false
+#                           # Some vLLM/Qwen deployments expect the flag nested under chat_template_kwargs instead:
+#                           # extra_body:
+#                           #   chat_template_kwargs:
+#                           #     enable_thinking: false
 
 # =============================================================================
 # Persistent Memory
diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md
index 6ab24d0a421..46d7958cc42 100644
--- a/website/docs/integrations/providers.md
+++ b/website/docs/integrations/providers.md
@@ -792,6 +792,8 @@ hermes model
 
 Supported parsers: `hermes` (Qwen 2.5, Hermes 2/3), `llama3_json` (Llama 3.x), `mistral`, `deepseek_v3`, `deepseek_v31`, `xlam`, `pythonic`. Without these flags, tool calls won't work — the model will output tool calls as text.
 
+**Qwen reasoning parsers:** When an OpenAI-compatible server returns structured reasoning metadata such as `reasoning`, `reasoning_content`, or streamed reasoning deltas, Hermes preserves it. That metadata is treated as reasoning/thinking trace data, not as a replacement for the assistant's visible answer. For Qwen reasoning models served by vLLM, make sure the final user-visible response still appears in `content`. If `--reasoning-parser qwen3` leaves `content` empty in your deployment, either disable that parser or pass a server-supported request option through `extra_body`, such as `chat_template_kwargs: {enable_thinking: false}`.
+
 :::tip
 vLLM supports human-readable sizes: `--max-model-len 64k` (lowercase k = 1000, uppercase K = 1024).
 :::
@@ -1272,6 +1274,14 @@ extra_body:
     enable_thinking: true
 ```
 
+For Qwen reasoning models served by vLLM, this same shape, with `enable_thinking: false`, can be used to turn thinking off when a reasoning parser routes all generated text into the reasoning fields and leaves the assistant `content` empty:
+
+```yaml
+extra_body:
+  chat_template_kwargs:
+    enable_thinking: false
+```
+
 The `hermes model` → Custom Endpoint wizard now prompts for `api_mode` explicitly and persists your answer to `config.yaml`. URL-based auto-detection (e.g. `/anthropic` paths → `anthropic_messages`) still happens as a fallback when the field is left blank.
 
 **Native vision for custom-provider models.** If your custom endpoint serves a vision-capable model that isn't in models.dev, set `model.supports_vision: true` so Hermes routes attached images natively (as `image_url` parts) instead of pre-processing them through `vision_analyze`. Single knob — no need to also set `agent.image_input_mode: native`.