From bc15db623145366bf36401ec07091856072438af Mon Sep 17 00:00:00 2001
From: Magicray1217
Date: Sun, 19 Apr 2026 09:17:44 +0800
Subject: [PATCH] docs(docker): add section on connecting to local inference servers (vLLM, Ollama)

Adds a comprehensive guide for connecting Dockerized Hermes to local inference servers like vLLM and Ollama, covering:

- Docker Compose networking (recommended)
- Standalone Docker run with host.docker.internal / --network host
- Connectivity verification steps
- Ollama-specific example

Closes #12308
---
 website/docs/user-guide/docker.md | 133 ++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/website/docs/user-guide/docker.md b/website/docs/user-guide/docker.md
index c780223b5..8eaddd994 100644
--- a/website/docs/user-guide/docker.md
+++ b/website/docs/user-guide/docker.md
@@ -237,6 +237,139 @@ When using Docker as the execution environment (not the methods above, but when

The same syncing happens for SSH and Modal backends — skills and credential files are uploaded via rsync or the Modal mount API before each command.

## Connecting to local inference servers (vLLM, Ollama, etc.)

When Hermes runs in Docker and your inference server (vLLM, Ollama, text-generation-inference, etc.) runs on the host or in another container, networking needs extra attention: `localhost` inside the Hermes container refers to the container itself, not to the host.

### Docker Compose (recommended)

Put both services on the same Docker network.
This is the most reliable approach:

```yaml
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm
    command: >
      --model Qwen/Qwen2.5-7B-Instruct
      --served-model-name my-model
      --host 0.0.0.0
      --port 8000
    ports:
      - "8000:8000"
    networks:
      - hermes-net
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]

  hermes:
    image: nousresearch/hermes-agent:latest
    container_name: hermes
    restart: unless-stopped
    command: gateway run
    ports:
      - "8642:8642"
    volumes:
      - ~/.hermes:/opt/data
    networks:
      - hermes-net

networks:
  hermes-net:
    driver: bridge
```

Then in your `~/.hermes/config.yaml`, use the **container name** as the hostname:

```yaml
model:
  provider: custom
  model: my-model
  base_url: http://vllm:8000/v1
  api_key: "none"
```

:::tip Key points
- Use the **container name** (`vllm`) as the hostname — not `localhost` or `127.0.0.1`, which refer to the Hermes container itself.
- The `model` value must match the `--served-model-name` you passed to vLLM.
- Set `api_key` to any non-empty string (the client requires a key, but vLLM ignores it unless the server was started with `--api-key`).
- Do **not** include a trailing slash in `base_url`.
+::: + +### Standalone Docker run (no Compose) + +If your inference server runs directly on the host (not in Docker), use `host.docker.internal` on macOS/Windows, or `--network host` on Linux: + +**macOS / Windows:** + +```sh +docker run -d \ + --name hermes \ + -v ~/.hermes:/opt/data \ + -p 8642:8642 \ + nousresearch/hermes-agent gateway run +``` + +```yaml +# config.yaml +model: + provider: custom + model: my-model + base_url: http://host.docker.internal:8000/v1 + api_key: "none" +``` + +**Linux (host networking):** + +```sh +docker run -d \ + --name hermes \ + --network host \ + -v ~/.hermes:/opt/data \ + nousresearch/hermes-agent gateway run +``` + +```yaml +# config.yaml +model: + provider: custom + model: my-model + base_url: http://127.0.0.1:8000/v1 + api_key: "none" +``` + +:::warning With `--network host`, the `-p` flag is ignored — all container ports are directly exposed on the host. +::: + +### Verifying connectivity + +From inside the Hermes container, confirm the inference server is reachable: + +```sh +docker exec hermes curl -s http://vllm:8000/v1/models +``` + +You should see a JSON response listing your served model. If this fails, check: + +1. Both containers are on the same Docker network (`docker network inspect hermes-net`) +2. The inference server is listening on `0.0.0.0`, not `127.0.0.1` +3. The port number matches + +### Ollama + +Ollama works the same way. If Ollama runs on the host, use `host.docker.internal:11434` (macOS/Windows) or `127.0.0.1:11434` (Linux with `--network host`). If Ollama runs in its own container on the same Docker network: + +```yaml +model: + provider: custom + model: llama3 + base_url: http://ollama:11434/v1 + api_key: "none" +``` + ## Troubleshooting ### Container exits immediately