feat: add direct endpoint overrides for auxiliary and delegation

Add base_url/api_key overrides for auxiliary tasks and delegation so users can route those flows straight to a custom OpenAI-compatible endpoint without having to rely on provider=main or named custom providers. Also clear gateway session env vars in test isolation so the full suite stays deterministic when run from a messaging-backed agent session.
2026-04-25 00:51:20 +00:00 · 2026-03-14 20:48:29 -07:00 · 2026-03-14 20:48:29 -07:00 · 9f6bccd76a
commit 9f6bccd76a
parent 6c24d76533
12 changed files with 526 additions and 99 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -30,6 +30,10 @@ Default "auto" follows the chains above.
 Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
 AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
 than the provider's default.
+
+Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
+AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
+custom OpenAI-compatible endpoint without touching the main model settings.
 """

 import json
@ -418,6 +422,17 @@ def _get_auxiliary_provider(task: str = "") -> str:
    return "auto"


+def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]:
+    """Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes."""
+    if not task:
+        return None
+    for prefix in ("AUXILIARY_", "CONTEXT_"):
+        val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip()
+        if val:
+            return val
+    return None
+
+
 def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
    or_key = os.getenv("OPENROUTER_API_KEY")
    if not or_key:
@ -564,6 +579,8 @@ def resolve_provider_client(
    model: str = None,
    async_mode: bool = False,
    raw_codex: bool = False,
+    explicit_base_url: str = None,
+    explicit_api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Central router: given a provider name and optional model, return a
    configured client with the correct auth, base URL, and API format.
@ -585,6 +602,8 @@ def resolve_provider_client(
            instead of wrapping in CodexAuxiliaryClient.  Use this when
            the caller needs direct access to responses.stream() (e.g.,
            the main agent loop).
+        explicit_base_url: Optional direct OpenAI-compatible endpoint.
+        explicit_api_key: Optional API key paired with explicit_base_url.

    Returns:
        (client, resolved_model) or (None, None) if auth is unavailable.
@ -661,6 +680,22 @@ def resolve_provider_client(

    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
    if provider == "custom":
+        if explicit_base_url:
+            custom_base = explicit_base_url.strip()
+            custom_key = (
+                (explicit_api_key or "").strip()
+                or os.getenv("OPENAI_API_KEY", "").strip()
+            )
+            if not custom_base or not custom_key:
+                logger.warning(
+                    "resolve_provider_client: explicit custom endpoint requested "
+                    "but no API key was found (set explicit_api_key or OPENAI_API_KEY)"
+                )
+                return None, None
+            final_model = model or _read_main_model() or "gpt-4o-mini"
+            client = OpenAI(api_key=custom_key, base_url=custom_base)
+            return (_to_async_client(client, final_model) if async_mode
+                    else (client, final_model))
        # Try custom first, then codex, then API-key providers
        for try_fn in (_try_custom_endpoint, _try_codex,
                       _resolve_api_key_provider):
@ -749,10 +784,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
    Callers may override the returned model with a per-task env var
    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
    """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced)
-    return resolve_provider_client("auto")
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )


 def get_async_text_auxiliary_client(task: str = ""):
@ -762,10 +800,14 @@ def get_async_text_auxiliary_client(task: str = ""):
    (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
    Returns (None, None) when no provider is available.
    """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced, async_mode=True)
-    return resolve_provider_client("auto", async_mode=True)
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        async_mode=True,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )


 _VISION_AUTO_PROVIDER_ORDER = (
@ -821,26 +863,43 @@ def resolve_vision_provider_client(
    provider: Optional[str] = None,
    model: Optional[str] = None,
    *,
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
    async_mode: bool = False,
 ) -> Tuple[Optional[str], Optional[Any], Optional[str]]:
    """Resolve the client actually used for vision tasks.

-    Explicit provider overrides still use the generic provider router for
-    non-standard backends, so users can intentionally force experimental
-    providers. Auto mode stays conservative and only tries vision backends
-    known to work today.
+    Direct endpoint overrides take precedence over provider selection. Explicit
+    provider overrides still use the generic provider router for non-standard
+    backends, so users can intentionally force experimental providers. Auto mode
+    stays conservative and only tries vision backends known to work today.
    """
-    requested = _normalize_vision_provider(provider or _get_auxiliary_provider("vision"))
+    requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        "vision", provider, model, base_url, api_key
+    )
+    requested = _normalize_vision_provider(requested)

    def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[str]):
        if sync_client is None:
            return resolved_provider, None, None
-        final_model = model or default_model
+        final_model = resolved_model or default_model
        if async_mode:
            async_client, async_model = _to_async_client(sync_client, final_model)
            return resolved_provider, async_client, async_model
        return resolved_provider, sync_client, final_model

+    if resolved_base_url:
+        client, final_model = resolve_provider_client(
+            "custom",
+            model=resolved_model,
+            async_mode=async_mode,
+            explicit_base_url=resolved_base_url,
+            explicit_api_key=resolved_api_key,
+        )
+        if client is None:
+            return "custom", None, None
+        return "custom", client, final_model
+
    if requested == "auto":
        for candidate in get_available_vision_backends():
            sync_client, default_model = _resolve_strict_vision_backend(candidate)
@ -853,7 +912,7 @@ def resolve_vision_provider_client(
        sync_client, default_model = _resolve_strict_vision_backend(requested)
        return _finalize(requested, sync_client, default_model)

-    client, final_model = _get_cached_client(requested, model, async_mode)
+    client, final_model = _get_cached_client(requested, resolved_model, async_mode)
    if client is None:
        return requested, None, None
    return requested, client, final_model
@ -910,19 +969,29 @@ def auxiliary_max_tokens_param(value: int) -> dict:
 # Every auxiliary LLM consumer should use these instead of manually
 # constructing clients and calling .chat.completions.create().

-# Client cache: (provider, async_mode) -> (client, default_model)
+# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model)
 _client_cache: Dict[tuple, tuple] = {}


 def _get_cached_client(
-    provider: str, model: str = None, async_mode: bool = False,
+    provider: str,
+    model: str = None,
+    async_mode: bool = False,
+    base_url: str = None,
+    api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Get or create a cached client for the given provider."""
-    cache_key = (provider, async_mode)
+    cache_key = (provider, async_mode, base_url or "", api_key or "")
    if cache_key in _client_cache:
        cached_client, cached_default = _client_cache[cache_key]
        return cached_client, model or cached_default
-    client, default_model = resolve_provider_client(provider, model, async_mode)
+    client, default_model = resolve_provider_client(
+        provider,
+        model,
+        async_mode,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )
    if client is not None:
        _client_cache[cache_key] = (client, default_model)
    return client, model or default_model
@ -932,57 +1001,75 @@ def _resolve_task_provider_model(
    task: str = None,
    provider: str = None,
    model: str = None,
-) -> Tuple[str, Optional[str]]:
+    base_url: str = None,
+    api_key: str = None,
+) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
    """Determine provider + model for a call.

    Priority:
-      1. Explicit provider/model args (always win)
-      2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
-      3. Config file (auxiliary.{task}.provider/model or compression.*)
+      1. Explicit provider/model/base_url/api_key args (always win)
+      2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*)
+      3. Config file (auxiliary.{task}.* or compression.*)
      4. "auto" (full auto-detection chain)

-    Returns (provider, model) where model may be None (use provider default).
+    Returns (provider, model, base_url, api_key) where model may be None
+    (use provider default). When base_url is set, provider is forced to
+    "custom" and the task uses that direct endpoint.
    """
-    if provider:
-        return provider, model
+    config = {}
+    cfg_provider = None
+    cfg_model = None
+    cfg_base_url = None
+    cfg_api_key = None

    if task:
-        # Check env var overrides first
-        env_provider = _get_auxiliary_provider(task)
-        if env_provider != "auto":
-            # Check for env var model override too
-            env_model = None
-            for prefix in ("AUXILIARY_", "CONTEXT_"):
-                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
-                if val:
-                    env_model = val
-                    break
-            return env_provider, model or env_model
-
-        # Read from config file
        try:
            from hermes_cli.config import load_config
            config = load_config()
        except ImportError:
-            return "auto", model
+            config = {}

-        # Check auxiliary.{task} section
-        aux = config.get("auxiliary", {})
-        task_config = aux.get(task, {})
-        cfg_provider = task_config.get("provider", "").strip() or None
-        cfg_model = task_config.get("model", "").strip() or None
+        aux = config.get("auxiliary", {}) if isinstance(config, dict) else {}
+        task_config = aux.get(task, {}) if isinstance(aux, dict) else {}
+        if not isinstance(task_config, dict):
+            task_config = {}
+        cfg_provider = str(task_config.get("provider", "")).strip() or None
+        cfg_model = str(task_config.get("model", "")).strip() or None
+        cfg_base_url = str(task_config.get("base_url", "")).strip() or None
+        cfg_api_key = str(task_config.get("api_key", "")).strip() or None

        # Backwards compat: compression section has its own keys
        if task == "compression" and not cfg_provider:
-            comp = config.get("compression", {})
-            cfg_provider = comp.get("summary_provider", "").strip() or None
-            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
+            comp = config.get("compression", {}) if isinstance(config, dict) else {}
+            if isinstance(comp, dict):
+                cfg_provider = comp.get("summary_provider", "").strip() or None
+                cfg_model = cfg_model or comp.get("summary_model", "").strip() or None

+    env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
+    resolved_model = model or env_model or cfg_model
+
+    if base_url:
+        return "custom", resolved_model, base_url, api_key
+    if provider:
+        return provider, resolved_model, base_url, api_key
+
+    if task:
+        env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
+        env_api_key = _get_auxiliary_env_override(task, "API_KEY")
+        if env_base_url:
+            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key
+
+        env_provider = _get_auxiliary_provider(task)
+        if env_provider != "auto":
+            return env_provider, resolved_model, None, None
+
+        if cfg_base_url:
+            return "custom", resolved_model, cfg_base_url, cfg_api_key
        if cfg_provider and cfg_provider != "auto":
-            return cfg_provider, model or cfg_model
-        return "auto", model or cfg_model
+            return cfg_provider, resolved_model, None, None
+        return "auto", resolved_model, None, None

-    return "auto", model
+    return "auto", resolved_model, None, None


 def _build_call_kwargs(
@ -994,6 +1081,7 @@ def _build_call_kwargs(
    tools: Optional[list] = None,
    timeout: float = 30.0,
    extra_body: Optional[dict] = None,
+    base_url: Optional[str] = None,
 ) -> dict:
    """Build kwargs for .chat.completions.create() with model/provider adjustments."""
    kwargs: Dict[str, Any] = {
@ -1009,7 +1097,7 @@ def _build_call_kwargs(
        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
        if provider == "custom":
-            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            custom_base = base_url or os.getenv("OPENAI_BASE_URL", "")
            if "api.openai.com" in custom_base.lower():
                kwargs["max_completion_tokens"] = max_tokens
            else:
@ -1035,6 +1123,8 @@ def call_llm(
    *,
    provider: str = None,
    model: str = None,
+    base_url: str = None,
+    api_key: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
@ -1066,16 +1156,18 @@ def call_llm(
    Raises:
        RuntimeError: If no provider is configured.
    """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
            async_mode=False,
        )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
            logger.warning(
                "Vision provider %s unavailable, falling back to auto vision backends",
                resolved_provider,
@ -1092,10 +1184,15 @@ def call_llm(
            )
        resolved_provider = effective_provider or resolved_provider
    else:
-        client, final_model = _get_cached_client(resolved_provider, resolved_model)
+        client, final_model = _get_cached_client(
+            resolved_provider,
+            resolved_model,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
        if client is None:
            # Fallback: try openrouter
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                logger.warning("Provider %s unavailable, falling back to openrouter",
                               resolved_provider)
                client, final_model = _get_cached_client(
@ -1108,7 +1205,8 @@ def call_llm(
    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)

    # Handle max_tokens vs max_completion_tokens retry
    try:
@ -1127,6 +1225,8 @@ async def async_call_llm(
    *,
    provider: str = None,
    model: str = None,
+    base_url: str = None,
+    api_key: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
@ -1138,16 +1238,18 @@ async def async_call_llm(

    Same as call_llm() but async. See call_llm() for full documentation.
    """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
            async_mode=True,
        )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
            logger.warning(
                "Vision provider %s unavailable, falling back to auto vision backends",
                resolved_provider,
@ -1165,9 +1267,14 @@ async def async_call_llm(
        resolved_provider = effective_provider or resolved_provider
    else:
        client, final_model = _get_cached_client(
-            resolved_provider, resolved_model, async_mode=True)
+            resolved_provider,
+            resolved_model,
+            async_mode=True,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
        if client is None:
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                logger.warning("Provider %s unavailable, falling back to openrouter",
                               resolved_provider)
                client, final_model = _get_cached_client(
@ -1181,7 +1288,8 @@ async def async_call_llm(
    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)

    try:
        return await client.chat.completions.create(**kwargs)
--- a/cli.py
+++ b/cli.py
@ -218,11 +218,27 @@ def load_cli_config() -> Dict[str, Any]:
            "timeout": 300,    # Max seconds a sandbox script can run before being killed (5 min)
            "max_tool_calls": 50,  # Max RPC tool calls per execution
        },
+        "auxiliary": {
+            "vision": {
+                "provider": "auto",
+                "model": "",
+                "base_url": "",
+                "api_key": "",
+            },
+            "web_extract": {
+                "provider": "auto",
+                "model": "",
+                "base_url": "",
+                "api_key": "",
+            },
+        },
        "delegation": {
            "max_iterations": 45,  # Max tool-calling turns per child agent
            "default_toolsets": ["terminal", "file", "web"],  # Default toolsets for subagents
            "model": "",       # Subagent model override (empty = inherit parent model)
            "provider": "",    # Subagent provider override (empty = inherit parent provider)
+            "base_url": "",    # Direct OpenAI-compatible endpoint for subagents
+            "api_key": "",     # API key for delegation.base_url (falls back to OPENAI_API_KEY)
        },
    }
    
@ -363,28 +379,44 @@ def load_cli_config() -> Dict[str, Any]:
        if config_key in compression_config:
            os.environ[env_var] = str(compression_config[config_key])
    
-    # Apply auxiliary model overrides to environment variables.
-    # Vision and web_extract each have their own provider + model pair.
+    # Apply auxiliary model/direct-endpoint overrides to environment variables.
+    # Vision and web_extract each have their own provider/model/base_url/api_key tuple.
    # (Compression is handled in the compression section above.)
    # Only set env vars for non-empty / non-default values so auto-detection
    # still works.
    auxiliary_config = defaults.get("auxiliary", {})
    auxiliary_task_env = {
-        # config key → (provider env var, model env var)
-        "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-        "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+        # config key → env var mapping
+        "vision": {
+            "provider": "AUXILIARY_VISION_PROVIDER",
+            "model": "AUXILIARY_VISION_MODEL",
+            "base_url": "AUXILIARY_VISION_BASE_URL",
+            "api_key": "AUXILIARY_VISION_API_KEY",
+        },
+        "web_extract": {
+            "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+            "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+            "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+            "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+        },
    }
    
-    for task_key, (prov_env, model_env) in auxiliary_task_env.items():
+    for task_key, env_map in auxiliary_task_env.items():
        task_cfg = auxiliary_config.get(task_key, {})
        if not isinstance(task_cfg, dict):
            continue
        prov = str(task_cfg.get("provider", "")).strip()
        model = str(task_cfg.get("model", "")).strip()
+        base_url = str(task_cfg.get("base_url", "")).strip()
+        api_key = str(task_cfg.get("api_key", "")).strip()
        if prov and prov != "auto":
-            os.environ[prov_env] = prov
+            os.environ[env_map["provider"]] = prov
        if model:
-            os.environ[model_env] = model
+            os.environ[env_map["model"]] = model
+        if base_url:
+            os.environ[env_map["base_url"]] = base_url
+        if api_key:
+            os.environ[env_map["api_key"]] = api_key
    
    # Security settings
    security_config = defaults.get("security", {})
--- a/gateway/run.py
+++ b/gateway/run.py
@ -100,24 +100,40 @@ if _config_path.exists():
            for _cfg_key, _env_var in _compression_env_map.items():
                if _cfg_key in _compression_cfg:
                    os.environ[_env_var] = str(_compression_cfg[_cfg_key])
-        # Auxiliary model overrides (vision, web_extract).
-        # Each task has provider + model; bridge non-default values to env vars.
+        # Auxiliary model/direct-endpoint overrides (vision, web_extract).
+        # Each task has provider/model/base_url/api_key; bridge non-default values to env vars.
        _auxiliary_cfg = _cfg.get("auxiliary", {})
        if _auxiliary_cfg and isinstance(_auxiliary_cfg, dict):
            _aux_task_env = {
-                "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-                "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+                "vision": {
+                    "provider": "AUXILIARY_VISION_PROVIDER",
+                    "model": "AUXILIARY_VISION_MODEL",
+                    "base_url": "AUXILIARY_VISION_BASE_URL",
+                    "api_key": "AUXILIARY_VISION_API_KEY",
+                },
+                "web_extract": {
+                    "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+                    "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+                    "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+                    "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+                },
            }
-            for _task_key, (_prov_env, _model_env) in _aux_task_env.items():
+            for _task_key, _env_map in _aux_task_env.items():
                _task_cfg = _auxiliary_cfg.get(_task_key, {})
                if not isinstance(_task_cfg, dict):
                    continue
                _prov = str(_task_cfg.get("provider", "")).strip()
                _model = str(_task_cfg.get("model", "")).strip()
+                _base_url = str(_task_cfg.get("base_url", "")).strip()
+                _api_key = str(_task_cfg.get("api_key", "")).strip()
                if _prov and _prov != "auto":
-                    os.environ[_prov_env] = _prov
+                    os.environ[_env_map["provider"]] = _prov
                if _model:
-                    os.environ[_model_env] = _model
+                    os.environ[_env_map["model"]] = _model
+                if _base_url:
+                    os.environ[_env_map["base_url"]] = _base_url
+                if _api_key:
+                    os.environ[_env_map["api_key"]] = _api_key
        _agent_cfg = _cfg.get("agent", {})
        if _agent_cfg and isinstance(_agent_cfg, dict):
            if "max_turns" in _agent_cfg:
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -150,30 +150,44 @@ DEFAULT_CONFIG = {
        "vision": {
            "provider": "auto",    # auto | openrouter | nous | codex | custom
            "model": "",           # e.g. "google/gemini-2.5-flash", "gpt-4o"
+            "base_url": "",        # direct OpenAI-compatible endpoint (takes precedence over provider)
+            "api_key": "",         # API key for base_url (falls back to OPENAI_API_KEY)
        },
        "web_extract": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
        "compression": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
        "session_search": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
        "skills_hub": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
        "mcp": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
        "flush_memories": {
            "provider": "auto",
            "model": "",
+            "base_url": "",
+            "api_key": "",
        },
    },
    
@ -243,6 +257,8 @@ DEFAULT_CONFIG = {
    "delegation": {
        "model": "",       # e.g. "google/gemini-3-flash-preview" (empty = inherit parent model)
        "provider": "",    # e.g. "openrouter" (empty = inherit parent provider + credentials)
+        "base_url": "",    # direct OpenAI-compatible endpoint for subagents
+        "api_key": "",     # API key for delegation.base_url (falls back to OPENAI_API_KEY)
    },

    # Ephemeral prefill messages file — JSON list of {role, content} dicts
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -24,9 +24,11 @@ def _clean_env(monkeypatch):
    for key in (
        "OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY",
        "OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL",
-        # Per-task provider/model overrides
+        # Per-task provider/model/direct-endpoint overrides
        "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
+        "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY",
        "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
+        "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY",
        "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL",
    ):
        monkeypatch.delenv(key, raising=False)
@ -142,6 +144,27 @@ class TestGetTextAuxiliaryClient:
        call_kwargs = mock_openai.call_args
        assert call_kwargs.kwargs["base_url"] == "http://localhost:1234/v1"

+    def test_task_direct_endpoint_override(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_API_KEY", "task-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert model == "task-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:2345/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "task-key"
+
+    def test_task_direct_endpoint_without_openai_key_does_not_fall_back(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert client is None
+        assert model is None
+        mock_openai.assert_not_called()
+
    def test_codex_fallback_when_nothing_else(self, codex_auth_dir):
        with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
             patch("agent.auxiliary_client.OpenAI") as mock_openai:
@ -194,6 +217,27 @@ class TestVisionClientFallback:
            client, model = get_vision_auxiliary_client()
        assert client is not None  # Custom endpoint picked up as fallback

+    def test_vision_direct_endpoint_override(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1")
+        monkeypatch.setenv("AUXILIARY_VISION_API_KEY", "vision-key")
+        monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_vision_auxiliary_client()
+        assert model == "vision-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:4567/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "vision-key"
+
+    def test_vision_direct_endpoint_requires_openai_api_key(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1")
+        monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_vision_auxiliary_client()
+        assert client is None
+        assert model is None
+        mock_openai.assert_not_called()
+
    def test_vision_uses_openrouter_when_available(self, monkeypatch):
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
@ -390,6 +434,24 @@ class TestTaskSpecificOverrides:
            client, model = get_text_auxiliary_client("web_extract")
        assert model == "google/gemini-3-flash-preview"

+    def test_task_direct_endpoint_from_config(self, monkeypatch, tmp_path):
+        hermes_home = tmp_path / "hermes"
+        hermes_home.mkdir(parents=True, exist_ok=True)
+        (hermes_home / "config.yaml").write_text(
+            """auxiliary:
+  web_extract:
+    base_url: http://localhost:3456/v1
+    api_key: config-key
+    model: config-model
+"""
+        )
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert model == "config-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:3456/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "config-key"
+
    def test_task_without_override_uses_auto(self, monkeypatch):
        """A task with no provider env var falls through to auto chain."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -26,6 +26,12 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
    (fake_home / "memories").mkdir()
    (fake_home / "skills").mkdir()
    monkeypatch.setenv("HERMES_HOME", str(fake_home))
+    # Tests should not inherit the agent's current gateway/messaging surface.
+    # Individual tests that need gateway behavior set these explicitly.
+    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
+    monkeypatch.delenv("HERMES_SESSION_CHAT_ID", raising=False)
+    monkeypatch.delenv("HERMES_SESSION_CHAT_NAME", raising=False)
+    monkeypatch.delenv("HERMES_GATEWAY_SESSION", raising=False)


@pytest.fixture()
--- a/tests/test_auxiliary_config_bridge.py
+++ b/tests/test_auxiliary_config_bridge.py
@ -25,7 +25,9 @@ def _run_auxiliary_bridge(config_dict, monkeypatch):
    # Clear env vars
    for key in (
        "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
+        "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY",
        "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
+        "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY",
        "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL",
    ):
        monkeypatch.delenv(key, raising=False)
@ -47,19 +49,35 @@ def _run_auxiliary_bridge(config_dict, monkeypatch):
    auxiliary_cfg = config_dict.get("auxiliary", {})
    if auxiliary_cfg and isinstance(auxiliary_cfg, dict):
        aux_task_env = {
-            "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-            "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+            "vision": {
+                "provider": "AUXILIARY_VISION_PROVIDER",
+                "model": "AUXILIARY_VISION_MODEL",
+                "base_url": "AUXILIARY_VISION_BASE_URL",
+                "api_key": "AUXILIARY_VISION_API_KEY",
+            },
+            "web_extract": {
+                "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+                "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+                "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+                "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+            },
        }
-        for task_key, (prov_env, model_env) in aux_task_env.items():
+        for task_key, env_map in aux_task_env.items():
            task_cfg = auxiliary_cfg.get(task_key, {})
            if not isinstance(task_cfg, dict):
                continue
            prov = str(task_cfg.get("provider", "")).strip()
            model = str(task_cfg.get("model", "")).strip()
+            base_url = str(task_cfg.get("base_url", "")).strip()
+            api_key = str(task_cfg.get("api_key", "")).strip()
            if prov and prov != "auto":
-                os.environ[prov_env] = prov
+                os.environ[env_map["provider"]] = prov
            if model:
-                os.environ[model_env] = model
+                os.environ[env_map["model"]] = model
+            if base_url:
+                os.environ[env_map["base_url"]] = base_url
+            if api_key:
+                os.environ[env_map["api_key"]] = api_key


 # ── Config bridging tests ────────────────────────────────────────────────────
@ -101,6 +119,21 @@ class TestAuxiliaryConfigBridge:
        assert os.environ.get("AUXILIARY_WEB_EXTRACT_PROVIDER") == "nous"
        assert os.environ.get("AUXILIARY_WEB_EXTRACT_MODEL") == "gemini-2.5-flash"

+    def test_direct_endpoint_bridged(self, monkeypatch):
+        config = {
+            "auxiliary": {
+                "vision": {
+                    "base_url": "http://localhost:1234/v1",
+                    "api_key": "local-key",
+                    "model": "qwen2.5-vl",
+                }
+            }
+        }
+        _run_auxiliary_bridge(config, monkeypatch)
+        assert os.environ.get("AUXILIARY_VISION_BASE_URL") == "http://localhost:1234/v1"
+        assert os.environ.get("AUXILIARY_VISION_API_KEY") == "local-key"
+        assert os.environ.get("AUXILIARY_VISION_MODEL") == "qwen2.5-vl"
+
    def test_compression_provider_bridged(self, monkeypatch):
        config = {
            "compression": {
@ -200,8 +233,12 @@ class TestGatewayBridgeCodeParity:
        # Check for key patterns that indicate the bridge is present
        assert "AUXILIARY_VISION_PROVIDER" in content
        assert "AUXILIARY_VISION_MODEL" in content
+        assert "AUXILIARY_VISION_BASE_URL" in content
+        assert "AUXILIARY_VISION_API_KEY" in content
        assert "AUXILIARY_WEB_EXTRACT_PROVIDER" in content
        assert "AUXILIARY_WEB_EXTRACT_MODEL" in content
+        assert "AUXILIARY_WEB_EXTRACT_BASE_URL" in content
+        assert "AUXILIARY_WEB_EXTRACT_API_KEY" in content

    def test_gateway_has_compression_provider(self):
        """Gateway must bridge compression.summary_provider."""
--- a/tests/tools/test_delegate.py
+++ b/tests/tools/test_delegate.py
@ -10,6 +10,7 @@ Run with:  python -m pytest tests/test_delegate.py -v
 """

 import json
+import os
 import sys
 import unittest
 from unittest.mock import MagicMock, patch
@ -462,6 +463,43 @@ class TestDelegationCredentialResolution(unittest.TestCase):
        self.assertEqual(creds["api_mode"], "chat_completions")
        mock_resolve.assert_called_once_with(requested="openrouter")

+    def test_direct_endpoint_uses_configured_base_url_and_api_key(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "provider": "openrouter",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+        }
+        creds = _resolve_delegation_credentials(cfg, parent)
+        self.assertEqual(creds["model"], "qwen2.5-coder")
+        self.assertEqual(creds["provider"], "custom")
+        self.assertEqual(creds["base_url"], "http://localhost:1234/v1")
+        self.assertEqual(creds["api_key"], "local-key")
+        self.assertEqual(creds["api_mode"], "chat_completions")
+
+    def test_direct_endpoint_falls_back_to_openai_api_key_env(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+        }
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False):
+            creds = _resolve_delegation_credentials(cfg, parent)
+        self.assertEqual(creds["api_key"], "env-openai-key")
+        self.assertEqual(creds["provider"], "custom")
+
+    def test_direct_endpoint_does_not_fall_back_to_openrouter_api_key_env(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+        }
+        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "env-openrouter-key"}, clear=False):
+            with self.assertRaises(ValueError) as ctx:
+                _resolve_delegation_credentials(cfg, parent)
+        self.assertIn("OPENAI_API_KEY", str(ctx.exception))
+
    @patch("hermes_cli.runtime_provider.resolve_runtime_provider")
    def test_nous_provider_resolves_nous_credentials(self, mock_resolve):
        """Nous provider resolves Nous Portal base_url and api_key."""
@ -589,6 +627,40 @@ class TestDelegationProviderIntegration(unittest.TestCase):
            self.assertNotEqual(kwargs["base_url"], parent.base_url)
            self.assertNotEqual(kwargs["api_key"], parent.api_key)

+    @patch("tools.delegate_tool._load_config")
+    @patch("tools.delegate_tool._resolve_delegation_credentials")
+    def test_direct_endpoint_credentials_reach_child_agent(self, mock_creds, mock_cfg):
+        mock_cfg.return_value = {
+            "max_iterations": 45,
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+        }
+        mock_creds.return_value = {
+            "model": "qwen2.5-coder",
+            "provider": "custom",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+            "api_mode": "chat_completions",
+        }
+        parent = _make_mock_parent(depth=0)
+
+        with patch("run_agent.AIAgent") as MockAgent:
+            mock_child = MagicMock()
+            mock_child.run_conversation.return_value = {
+                "final_response": "done", "completed": True, "api_calls": 1
+            }
+            MockAgent.return_value = mock_child
+
+            delegate_task(goal="Direct endpoint test", parent_agent=parent)
+
+            _, kwargs = MockAgent.call_args
+            self.assertEqual(kwargs["model"], "qwen2.5-coder")
+            self.assertEqual(kwargs["provider"], "custom")
+            self.assertEqual(kwargs["base_url"], "http://localhost:1234/v1")
+            self.assertEqual(kwargs["api_key"], "local-key")
+            self.assertEqual(kwargs["api_mode"], "chat_completions")
+
    @patch("tools.delegate_tool._load_config")
    @patch("tools.delegate_tool._resolve_delegation_credentials")
    def test_empty_config_inherits_parent(self, mock_creds, mock_cfg):
--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@ -540,18 +540,51 @@ def delegate_task(
 def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
    """Resolve credentials for subagent delegation.

-    If ``delegation.provider`` is configured, resolves the full credential
-    bundle (base_url, api_key, api_mode, provider) via the runtime provider
-    system — the same path used by CLI/gateway startup.  This lets subagents
-    run on a completely different provider:model pair.
+    If ``delegation.base_url`` is configured, subagents use that direct
+    OpenAI-compatible endpoint. Otherwise, if ``delegation.provider`` is
+    configured, the full credential bundle (base_url, api_key, api_mode,
+    provider) is resolved via the runtime provider system — the same path used
+    by CLI/gateway startup. This lets subagents run on a completely different
+    provider:model pair.

-    If no provider is configured, returns None values so the child inherits
-    everything from the parent agent.
+    If neither base_url nor provider is configured, returns None values so the
+    child inherits everything from the parent agent.

    Raises ValueError with a user-friendly message on credential failure.
    """
-    configured_model = cfg.get("model") or None
-    configured_provider = cfg.get("provider") or None
+    configured_model = str(cfg.get("model") or "").strip() or None
+    configured_provider = str(cfg.get("provider") or "").strip() or None
+    configured_base_url = str(cfg.get("base_url") or "").strip() or None
+    configured_api_key = str(cfg.get("api_key") or "").strip() or None
+
+    if configured_base_url:
+        api_key = (
+            configured_api_key
+            or os.getenv("OPENAI_API_KEY", "").strip()
+        )
+        if not api_key:
+            raise ValueError(
+                "Delegation base_url is configured but no API key was found. "
+                "Set delegation.api_key or OPENAI_API_KEY."
+            )
+
+        base_lower = configured_base_url.lower()
+        provider = "custom"
+        api_mode = "chat_completions"
+        if "chatgpt.com/backend-api/codex" in base_lower:
+            provider = "openai-codex"
+            api_mode = "codex_responses"
+        elif "api.anthropic.com" in base_lower:
+            provider = "anthropic"
+            api_mode = "anthropic_messages"
+
+        return {
+            "model": configured_model,
+            "provider": provider,
+            "base_url": configured_base_url,
+            "api_key": api_key,
+            "api_mode": api_mode,
+        }

    if not configured_provider:
        # No provider override — child inherits everything from parent
@ -570,7 +603,8 @@ def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
    except Exception as exc:
        raise ValueError(
            f"Cannot resolve delegation provider '{configured_provider}': {exc}. "
-            f"Check that the provider is configured (API key set, valid provider name). "
+            f"Check that the provider is configured (API key set, valid provider name), "
+            f"or set delegation.base_url/delegation.api_key for a direct endpoint. "
            f"Available providers: openrouter, nous, zai, kimi-coding, minimax."
        ) from exc

--- a/website/docs/reference/environment-variables.md
+++ b/website/docs/reference/environment-variables.md
@ -180,6 +180,23 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
 | `CONTEXT_COMPRESSION_THRESHOLD` | Trigger at this % of limit (default: 0.50) |
 | `CONTEXT_COMPRESSION_MODEL` | Model for summaries |

+## Auxiliary Task Overrides
+
+| Variable | Description |
+|----------|-------------|
+| `AUXILIARY_VISION_PROVIDER` | Override provider for vision tasks |
+| `AUXILIARY_VISION_MODEL` | Override model for vision tasks |
+| `AUXILIARY_VISION_BASE_URL` | Direct OpenAI-compatible endpoint for vision tasks |
+| `AUXILIARY_VISION_API_KEY` | API key paired with `AUXILIARY_VISION_BASE_URL` |
+| `AUXILIARY_WEB_EXTRACT_PROVIDER` | Override provider for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_MODEL` | Override model for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_BASE_URL` | Direct OpenAI-compatible endpoint for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_API_KEY` | API key paired with `AUXILIARY_WEB_EXTRACT_BASE_URL` |
+| `CONTEXT_COMPRESSION_PROVIDER` | Override provider for context compression summaries |
+| `CONTEXT_COMPRESSION_MODEL` | Override model for context compression summaries |
+
+For task-specific direct endpoints, Hermes uses the task's configured API key or `OPENAI_API_KEY`. It does not reuse `OPENROUTER_API_KEY` for those custom endpoints.
+
 ## Provider Routing (config.yaml only)

 These go in `~/.hermes/config.yaml` under the `provider_routing` section:
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@ -569,11 +569,15 @@ auxiliary:
  vision:
    provider: "auto"           # "auto", "openrouter", "nous", "main"
    model: ""                  # e.g. "openai/gpt-4o", "google/gemini-2.5-flash"
+    base_url: ""               # direct OpenAI-compatible endpoint (takes precedence over provider)
+    api_key: ""                # API key for base_url (falls back to OPENAI_API_KEY)

  # Web page summarization + browser page text extraction
  web_extract:
    provider: "auto"
    model: ""                  # e.g. "google/gemini-2.5-flash"
+    base_url: ""
+    api_key: ""
 ```

 ### Changing the Vision Model
@ -604,6 +608,17 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o

 ### Common Setups

+**Using a direct custom endpoint** (clearer than `provider: "main"` for local/self-hosted APIs):
+```yaml
+auxiliary:
+  vision:
+    base_url: "http://localhost:1234/v1"
+    api_key: "local-key"
+    model: "qwen2.5-vl"
+```
+
+`base_url` takes precedence over `provider`, so this is the most explicit way to route an auxiliary task to a specific endpoint. For direct endpoint overrides, Hermes uses the configured `api_key` or falls back to `OPENAI_API_KEY`; it does not reuse `OPENROUTER_API_KEY` for that custom endpoint.
+
 **Using OpenAI API key for vision:**
 ```yaml
 # In ~/.hermes/.env:
@ -848,13 +863,17 @@ delegation:
    - web
  # model: "google/gemini-3-flash-preview"  # Override model (empty = inherit parent)
  # provider: "openrouter"                  # Override provider (empty = inherit parent)
+  # base_url: "http://localhost:1234/v1"    # Direct OpenAI-compatible endpoint (takes precedence over provider)
+  # api_key: "local-key"                    # API key for base_url (falls back to OPENAI_API_KEY)
 ```

 **Subagent provider:model override:** By default, subagents inherit the parent agent's provider and model. Set `delegation.provider` and `delegation.model` to route subagents to a different provider:model pair — e.g., use a cheap/fast model for narrowly-scoped subtasks while your primary agent runs an expensive reasoning model.

+**Direct endpoint override:** If you want the obvious custom-endpoint path, set `delegation.base_url`, `delegation.api_key`, and `delegation.model`. That sends subagents directly to that OpenAI-compatible endpoint and takes precedence over `delegation.provider`. If `delegation.api_key` is omitted, Hermes falls back to `OPENAI_API_KEY` only.
+
 The delegation provider uses the same credential resolution as CLI/gateway startup. All configured providers are supported: `openrouter`, `nous`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`. When a provider is set, the system automatically resolves the correct base URL, API key, and API mode — no manual credential wiring needed.

-**Precedence:** `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter).
+**Precedence:** `delegation.base_url` in config → `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter).

 ## Clarify

--- a/website/docs/user-guide/features/delegation.md
+++ b/website/docs/user-guide/features/delegation.md
@ -209,6 +209,14 @@ Delegation has a **depth limit of 2** — a parent (depth 0) can spawn children
 delegation:
  max_iterations: 50                        # Max turns per child (default: 50)
  default_toolsets: ["terminal", "file", "web"]  # Default toolsets
+  model: "google/gemini-3-flash-preview"             # Optional provider/model override
+  provider: "openrouter"                             # Optional built-in provider
+
+# Or use a direct custom endpoint instead of provider:
+delegation:
+  model: "qwen2.5-coder"
+  base_url: "http://localhost:1234/v1"
+  api_key: "local-key"
 ```

 :::tip