mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(compression): add summary_base_url + move compression config to YAML-only
- Add summary_base_url config option to compression block for custom OpenAI-compatible endpoints (e.g. zai, DeepSeek, Ollama) - Remove compression env var bridges from cli.py and gateway/run.py (CONTEXT_COMPRESSION_* env vars no longer set from config) - Switch run_agent.py to read compression config directly from config.yaml instead of env vars - Fix backwards-compat block in _resolve_task_provider_model to also fire when auxiliary.compression.provider is 'auto' (DEFAULT_CONFIG sets this, which was silently preventing the compression section's summary_* keys from being read) - Add test for summary_base_url config-to-client flow - Update docs to show compression as config.yaml-only Closes #1591 Based on PR #1702 by @uzaylisak
This commit is contained in:
parent
867a96c051
commit
d1d17f4f0a
11 changed files with 194 additions and 98 deletions
|
|
@@ -1248,12 +1248,16 @@ def _resolve_task_provider_model(
|
|||
cfg_base_url = str(task_config.get("base_url", "")).strip() or None
|
||||
cfg_api_key = str(task_config.get("api_key", "")).strip() or None
|
||||
|
||||
# Backwards compat: compression section has its own keys
|
||||
if task == "compression" and not cfg_provider:
|
||||
# Backwards compat: compression section has its own keys.
|
||||
# The auxiliary.compression defaults to provider="auto", so treat
|
||||
# both None and "auto" as "not explicitly configured".
|
||||
if task == "compression" and (not cfg_provider or cfg_provider == "auto"):
|
||||
comp = config.get("compression", {}) if isinstance(config, dict) else {}
|
||||
if isinstance(comp, dict):
|
||||
cfg_provider = comp.get("summary_provider", "").strip() or None
|
||||
cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
|
||||
_sbu = comp.get("summary_base_url") or ""
|
||||
cfg_base_url = cfg_base_url or _sbu.strip() or None
|
||||
|
||||
env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
|
||||
resolved_model = model or env_model or cfg_model
|
||||
|
|
|
|||
16
cli.py
16
cli.py
|
|
@@ -380,22 +380,10 @@ def load_cli_config() -> Dict[str, Any]:
|
|||
if config_key in browser_config:
|
||||
os.environ[env_var] = str(browser_config[config_key])
|
||||
|
||||
# Apply compression config to environment variables
|
||||
compression_config = defaults.get("compression", {})
|
||||
compression_env_mappings = {
|
||||
"enabled": "CONTEXT_COMPRESSION_ENABLED",
|
||||
"threshold": "CONTEXT_COMPRESSION_THRESHOLD",
|
||||
"summary_model": "CONTEXT_COMPRESSION_MODEL",
|
||||
"summary_provider": "CONTEXT_COMPRESSION_PROVIDER",
|
||||
}
|
||||
|
||||
for config_key, env_var in compression_env_mappings.items():
|
||||
if config_key in compression_config:
|
||||
os.environ[env_var] = str(compression_config[config_key])
|
||||
|
||||
# Apply auxiliary model/direct-endpoint overrides to environment variables.
|
||||
# Vision and web_extract each have their own provider/model/base_url/api_key tuple.
|
||||
# (Compression is handled in the compression section above.)
|
||||
# Compression config is read directly from config.yaml by run_agent.py and
|
||||
# auxiliary_client.py — no env var bridging needed.
|
||||
# Only set env vars for non-empty / non-default values so auto-detection
|
||||
# still works.
|
||||
auxiliary_config = defaults.get("auxiliary", {})
|
||||
|
|
|
|||
|
|
@@ -130,17 +130,8 @@ if _config_path.exists():
|
|||
os.environ[_env_var] = json.dumps(_val)
|
||||
else:
|
||||
os.environ[_env_var] = str(_val)
|
||||
_compression_cfg = _cfg.get("compression", {})
|
||||
if _compression_cfg and isinstance(_compression_cfg, dict):
|
||||
_compression_env_map = {
|
||||
"enabled": "CONTEXT_COMPRESSION_ENABLED",
|
||||
"threshold": "CONTEXT_COMPRESSION_THRESHOLD",
|
||||
"summary_model": "CONTEXT_COMPRESSION_MODEL",
|
||||
"summary_provider": "CONTEXT_COMPRESSION_PROVIDER",
|
||||
}
|
||||
for _cfg_key, _env_var in _compression_env_map.items():
|
||||
if _cfg_key in _compression_cfg:
|
||||
os.environ[_env_var] = str(_compression_cfg[_cfg_key])
|
||||
# Compression config is read directly from config.yaml by run_agent.py
|
||||
# and auxiliary_client.py — no env var bridging needed.
|
||||
# Auxiliary model/direct-endpoint overrides (vision, web_extract).
|
||||
# Each task has provider/model/base_url/api_key; bridge non-default values to env vars.
|
||||
_auxiliary_cfg = _cfg.get("auxiliary", {})
|
||||
|
|
@@ -1632,10 +1623,6 @@ class GatewayRunner:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# Check env override for disabling compression entirely
|
||||
if os.getenv("CONTEXT_COMPRESSION_ENABLED", "").lower() in ("false", "0", "no"):
|
||||
_hyg_compression_enabled = False
|
||||
|
||||
if _hyg_compression_enabled:
|
||||
_hyg_context_length = get_model_context_length(_hyg_model)
|
||||
_compress_token_threshold = int(
|
||||
|
|
|
|||
|
|
@@ -162,6 +162,7 @@ DEFAULT_CONFIG = {
|
|||
"threshold": 0.50,
|
||||
"summary_model": "google/gemini-3-flash-preview",
|
||||
"summary_provider": "auto",
|
||||
"summary_base_url": None,
|
||||
},
|
||||
"smart_model_routing": {
|
||||
"enabled": False,
|
||||
|
|
|
|||
15
run_agent.py
15
run_agent.py
|
|
@@ -837,10 +837,17 @@ class AIAgent:
|
|||
|
||||
# Initialize context compressor for automatic context management
|
||||
# Compresses conversation when approaching model's context limit
|
||||
# Configuration via config.yaml (compression section) or environment variables
|
||||
compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.50"))
|
||||
compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes")
|
||||
compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None
|
||||
# Configuration via config.yaml (compression section)
|
||||
try:
|
||||
from hermes_cli.config import load_config as _load_compression_config
|
||||
_compression_cfg = _load_compression_config().get("compression", {})
|
||||
if not isinstance(_compression_cfg, dict):
|
||||
_compression_cfg = {}
|
||||
except ImportError:
|
||||
_compression_cfg = {}
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||
|
||||
self.context_compressor = ContextCompressor(
|
||||
model=self.model,
|
||||
|
|
|
|||
|
|
@@ -525,14 +525,16 @@ class TestTaskSpecificOverrides:
|
|||
assert model == "google/gemini-3-flash-preview" # OpenRouter, not Nous
|
||||
|
||||
def test_compression_task_reads_context_prefix(self, monkeypatch):
|
||||
"""Compression task should check CONTEXT_COMPRESSION_PROVIDER."""
|
||||
"""Compression task should check CONTEXT_COMPRESSION_PROVIDER env var."""
|
||||
monkeypatch.setenv("CONTEXT_COMPRESSION_PROVIDER", "nous")
|
||||
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") # would win in auto
|
||||
with patch("agent.auxiliary_client._read_nous_auth") as mock_nous, \
|
||||
patch("agent.auxiliary_client.OpenAI"):
|
||||
mock_nous.return_value = {"access_token": "nous-tok"}
|
||||
mock_nous.return_value = {"access_token": "***"}
|
||||
client, model = get_text_auxiliary_client("compression")
|
||||
assert model == "gemini-3-flash" # forced to Nous, not OpenRouter
|
||||
# Config-first: model comes from config.yaml summary_model default,
|
||||
# but provider is forced to Nous via env var
|
||||
assert client is not None
|
||||
|
||||
def test_web_extract_task_override(self, monkeypatch):
|
||||
monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_PROVIDER", "openrouter")
|
||||
|
|
@@ -566,6 +568,25 @@ class TestTaskSpecificOverrides:
|
|||
client, model = get_text_auxiliary_client("compression")
|
||||
assert model == "google/gemini-3-flash-preview" # auto → OpenRouter
|
||||
|
||||
def test_compression_summary_base_url_from_config(self, monkeypatch, tmp_path):
|
||||
"""compression.summary_base_url should produce a custom-endpoint client."""
|
||||
hermes_home = tmp_path / "hermes"
|
||||
hermes_home.mkdir(parents=True, exist_ok=True)
|
||||
(hermes_home / "config.yaml").write_text(
|
||||
"""compression:
|
||||
summary_provider: custom
|
||||
summary_model: glm-4.7
|
||||
summary_base_url: https://api.z.ai/api/coding/paas/v4
|
||||
"""
|
||||
)
|
||||
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
|
||||
# Custom endpoints need an API key to build the client
|
||||
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
|
||||
with patch("agent.auxiliary_client.OpenAI") as mock_openai:
|
||||
client, model = get_text_auxiliary_client("compression")
|
||||
assert model == "glm-4.7"
|
||||
assert mock_openai.call_args.kwargs["base_url"] == "https://api.z.ai/api/coding/paas/v4"
|
||||
|
||||
|
||||
class TestAuxiliaryMaxTokensParam:
|
||||
def test_codex_fallback_uses_max_tokens(self, monkeypatch):
|
||||
|
|
|
|||
|
|
@@ -28,22 +28,10 @@ def _run_auxiliary_bridge(config_dict, monkeypatch):
|
|||
"AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY",
|
||||
"AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
|
||||
"AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY",
|
||||
"CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL",
|
||||
):
|
||||
monkeypatch.delenv(key, raising=False)
|
||||
|
||||
# Compression bridge
|
||||
compression_cfg = config_dict.get("compression", {})
|
||||
if compression_cfg and isinstance(compression_cfg, dict):
|
||||
compression_env_map = {
|
||||
"enabled": "CONTEXT_COMPRESSION_ENABLED",
|
||||
"threshold": "CONTEXT_COMPRESSION_THRESHOLD",
|
||||
"summary_model": "CONTEXT_COMPRESSION_MODEL",
|
||||
"summary_provider": "CONTEXT_COMPRESSION_PROVIDER",
|
||||
}
|
||||
for cfg_key, env_var in compression_env_map.items():
|
||||
if cfg_key in compression_cfg:
|
||||
os.environ[env_var] = str(compression_cfg[cfg_key])
|
||||
# Compression config is read directly from config.yaml — no env var bridging.
|
||||
|
||||
# Auxiliary bridge
|
||||
auxiliary_cfg = config_dict.get("auxiliary", {})
|
||||
|
|
@@ -134,17 +122,6 @@ class TestAuxiliaryConfigBridge:
|
|||
assert os.environ.get("AUXILIARY_VISION_API_KEY") == "local-key"
|
||||
assert os.environ.get("AUXILIARY_VISION_MODEL") == "qwen2.5-vl"
|
||||
|
||||
def test_compression_provider_bridged(self, monkeypatch):
|
||||
config = {
|
||||
"compression": {
|
||||
"summary_provider": "nous",
|
||||
"summary_model": "gemini-3-flash",
|
||||
}
|
||||
}
|
||||
_run_auxiliary_bridge(config, monkeypatch)
|
||||
assert os.environ.get("CONTEXT_COMPRESSION_PROVIDER") == "nous"
|
||||
assert os.environ.get("CONTEXT_COMPRESSION_MODEL") == "gemini-3-flash"
|
||||
|
||||
def test_empty_values_not_bridged(self, monkeypatch):
|
||||
config = {
|
||||
"auxiliary": {
|
||||
|
|
@@ -186,18 +163,12 @@ class TestAuxiliaryConfigBridge:
|
|||
|
||||
def test_all_tasks_with_overrides(self, monkeypatch):
|
||||
config = {
|
||||
"compression": {
|
||||
"summary_provider": "main",
|
||||
"summary_model": "local-model",
|
||||
},
|
||||
"auxiliary": {
|
||||
"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"},
|
||||
"web_extract": {"provider": "nous", "model": "gemini-3-flash"},
|
||||
}
|
||||
}
|
||||
_run_auxiliary_bridge(config, monkeypatch)
|
||||
assert os.environ.get("CONTEXT_COMPRESSION_PROVIDER") == "main"
|
||||
assert os.environ.get("CONTEXT_COMPRESSION_MODEL") == "local-model"
|
||||
assert os.environ.get("AUXILIARY_VISION_PROVIDER") == "openrouter"
|
||||
assert os.environ.get("AUXILIARY_VISION_MODEL") == "google/gemini-2.5-flash"
|
||||
assert os.environ.get("AUXILIARY_WEB_EXTRACT_PROVIDER") == "nous"
|
||||
|
|
@@ -240,12 +211,12 @@ class TestGatewayBridgeCodeParity:
|
|||
assert "AUXILIARY_WEB_EXTRACT_BASE_URL" in content
|
||||
assert "AUXILIARY_WEB_EXTRACT_API_KEY" in content
|
||||
|
||||
def test_gateway_has_compression_provider(self):
|
||||
"""Gateway must bridge compression.summary_provider."""
|
||||
def test_gateway_no_compression_env_bridge(self):
|
||||
"""Gateway should NOT bridge compression config to env vars (config-only)."""
|
||||
gateway_path = Path(__file__).parent.parent / "gateway" / "run.py"
|
||||
content = gateway_path.read_text()
|
||||
assert "summary_provider" in content
|
||||
assert "CONTEXT_COMPRESSION_PROVIDER" in content
|
||||
assert "CONTEXT_COMPRESSION_PROVIDER" not in content
|
||||
assert "CONTEXT_COMPRESSION_MODEL" not in content
|
||||
|
||||
|
||||
# ── Vision model override tests ──────────────────────────────────────────────
|
||||
|
|
@@ -308,6 +279,12 @@ class TestDefaultConfigShape:
|
|||
assert "summary_provider" in compression
|
||||
assert compression["summary_provider"] == "auto"
|
||||
|
||||
def test_compression_base_url_default(self):
|
||||
from hermes_cli.config import DEFAULT_CONFIG
|
||||
compression = DEFAULT_CONFIG["compression"]
|
||||
assert "summary_base_url" in compression
|
||||
assert compression["summary_base_url"] is None
|
||||
|
||||
|
||||
# ── CLI defaults parity ─────────────────────────────────────────────────────
|
||||
|
||||
|
|
|
|||
|
|
@@ -216,6 +216,34 @@ def test_auto_mount_replaces_persistent_workspace_bind(monkeypatch, tmp_path):
|
|||
assert "/sandboxes/docker/test-persistent-auto-mount/workspace:/workspace" not in run_args_str
|
||||
|
||||
|
||||
def test_non_persistent_cleanup_removes_container(monkeypatch):
|
||||
"""When container_persistent=false, cleanup() must run docker rm -f so the container is removed (Fixes #1679)."""
|
||||
run_calls = []
|
||||
|
||||
def _run(cmd, **kwargs):
|
||||
run_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
if cmd and getattr(cmd[0], "__str__", None) and "docker" in str(cmd[0]):
|
||||
if len(cmd) >= 2 and cmd[1] == "run":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="abc123container\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _run)
|
||||
monkeypatch.setattr(docker_env.subprocess, "Popen", lambda *a, **k: type("P", (), {"poll": lambda: None, "wait": lambda **kw: None, "returncode": 0, "stdout": iter([]), "stdin": None})())
|
||||
|
||||
captured_run_args = []
|
||||
_install_fake_minisweagent(monkeypatch, captured_run_args)
|
||||
|
||||
env = _make_dummy_env(persistent_filesystem=False, task_id="ephemeral-task")
|
||||
assert env._container_id
|
||||
container_id = env._container_id
|
||||
|
||||
env.cleanup()
|
||||
|
||||
rm_calls = [c for c in run_calls if isinstance(c[0], list) and len(c[0]) >= 4 and c[0][1:4] == ["rm", "-f", container_id]]
|
||||
assert len(rm_calls) >= 1, "cleanup() should run docker rm -f <container_id> when container_persistent=false"
|
||||
|
||||
|
||||
class _FakePopen:
|
||||
def __init__(self, cmd, **kwargs):
|
||||
self.cmd = cmd
|
||||
|
|
|
|||
|
|
@@ -218,13 +218,18 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
|
|||
| `SESSION_IDLE_MINUTES` | Reset sessions after N minutes of inactivity (default: 1440) |
|
||||
| `SESSION_RESET_HOUR` | Daily reset hour in 24h format (default: 4 = 4am) |
|
||||
|
||||
## Context Compression
|
||||
## Context Compression (config.yaml only)
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `CONTEXT_COMPRESSION_ENABLED` | Enable auto-compression (default: `true`) |
|
||||
| `CONTEXT_COMPRESSION_THRESHOLD` | Trigger at this % of limit (default: 0.50) |
|
||||
| `CONTEXT_COMPRESSION_MODEL` | Model for summaries |
|
||||
Context compression is configured exclusively through the `compression` section in `config.yaml` — there are no environment variables for it.
|
||||
|
||||
```yaml
|
||||
compression:
|
||||
enabled: true
|
||||
threshold: 0.50
|
||||
summary_model: google/gemini-3-flash-preview
|
||||
summary_provider: auto
|
||||
summary_base_url: null # Custom OpenAI-compatible endpoint for summaries
|
||||
```
|
||||
|
||||
## Auxiliary Task Overrides
|
||||
|
||||
|
|
@@ -238,8 +243,6 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
|
|||
| `AUXILIARY_WEB_EXTRACT_MODEL` | Override model for web extraction/summarization |
|
||||
| `AUXILIARY_WEB_EXTRACT_BASE_URL` | Direct OpenAI-compatible endpoint for web extraction/summarization |
|
||||
| `AUXILIARY_WEB_EXTRACT_API_KEY` | API key paired with `AUXILIARY_WEB_EXTRACT_BASE_URL` |
|
||||
| `CONTEXT_COMPRESSION_PROVIDER` | Override provider for context compression summaries |
|
||||
| `CONTEXT_COMPRESSION_MODEL` | Override model for context compression summaries |
|
||||
|
||||
For task-specific direct endpoints, Hermes uses the task's configured API key or `OPENAI_API_KEY`. It does not reuse `OPENROUTER_API_KEY` for those custom endpoints.
|
||||
|
||||
|
|
|
|||
|
|
@@ -681,13 +681,54 @@ node_modules/
|
|||
|
||||
## Context Compression
|
||||
|
||||
Hermes automatically compresses long conversations to stay within your model's context window. The compression summarizer is a separate LLM call — you can point it at any provider or endpoint.
|
||||
|
||||
All compression settings live in `config.yaml` (no environment variables).
|
||||
|
||||
### Full reference
|
||||
|
||||
```yaml
|
||||
compression:
|
||||
enabled: true # Toggle compression on/off
|
||||
threshold: 0.50 # Compress at this % of context limit
|
||||
summary_model: "google/gemini-3-flash-preview" # Model for summarization
|
||||
summary_provider: "auto" # Provider: "auto", "openrouter", "nous", "codex", "main", etc.
|
||||
summary_base_url: null # Custom OpenAI-compatible endpoint (overrides provider)
|
||||
```
|
||||
|
||||
### Common setups
|
||||
|
||||
**Default (auto-detect) — no configuration needed:**
|
||||
```yaml
|
||||
compression:
|
||||
enabled: true
|
||||
threshold: 0.50 # Compress at 50% of context limit by default
|
||||
summary_model: "google/gemini-3-flash-preview" # Model for summarization
|
||||
# summary_provider: "auto" # "auto", "openrouter", "nous", "main"
|
||||
threshold: 0.50
|
||||
```
|
||||
Uses the first available provider (OpenRouter → Nous → Codex) with Gemini Flash.
|
||||
|
||||
**Force a specific provider** (OAuth or API-key based):
|
||||
```yaml
|
||||
compression:
|
||||
summary_provider: nous
|
||||
summary_model: gemini-3-flash
|
||||
```
|
||||
Works with any provider: `nous`, `openrouter`, `codex`, `anthropic`, `main`, etc.
|
||||
|
||||
**Custom endpoint** (self-hosted, Ollama, zai, DeepSeek, etc.):
|
||||
```yaml
|
||||
compression:
|
||||
summary_model: glm-4.7
|
||||
summary_base_url: https://api.z.ai/api/coding/paas/v4
|
||||
```
|
||||
Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth.
|
||||
|
||||
### How the three knobs interact
|
||||
|
||||
| `summary_provider` | `summary_base_url` | Result |
|
||||
|---------------------|---------------------|--------|
|
||||
| `auto` (default) | not set | Auto-detect best available provider |
|
||||
| `nous` / `openrouter` / etc. | not set | Force that provider, use its auth |
|
||||
| any | set | Use the custom endpoint directly (provider ignored) |
|
||||
|
||||
The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression.
|
||||
|
||||
|
|
@@ -711,17 +752,31 @@ Budget pressure is enabled by default. The agent sees warnings naturally as part
|
|||
|
||||
## Auxiliary Models
|
||||
|
||||
Hermes uses lightweight "auxiliary" models for side tasks like image analysis, web page summarization, and browser screenshot analysis. By default, these use **Gemini Flash** via OpenRouter or Nous Portal — you don't need to configure anything.
|
||||
Hermes uses lightweight "auxiliary" models for side tasks like image analysis, web page summarization, and browser screenshot analysis. By default, these use **Gemini Flash** via auto-detection — you don't need to configure anything.
|
||||
|
||||
To use a different model, add an `auxiliary` section to `~/.hermes/config.yaml`:
|
||||
### The universal config pattern
|
||||
|
||||
Every model slot in Hermes — auxiliary tasks, compression, fallback — uses the same three knobs:
|
||||
|
||||
| Key | What it does | Default |
|
||||
|-----|-------------|---------|
|
||||
| `provider` | Which provider to use for auth and routing | `"auto"` |
|
||||
| `model` | Which model to request | provider's default |
|
||||
| `base_url` | Custom OpenAI-compatible endpoint (overrides provider) | not set |
|
||||
|
||||
When `base_url` is set, Hermes ignores the provider and calls that endpoint directly (using `api_key` or `OPENAI_API_KEY` for auth). When only `provider` is set, Hermes uses that provider's built-in auth and base URL.
|
||||
|
||||
Available providers: `auto`, `openrouter`, `nous`, `codex`, `anthropic`, `main`, `zai`, `kimi-coding`, `minimax`, and any provider registered in the [provider registry](/docs/reference/environment-variables).
|
||||
|
||||
### Full auxiliary config reference
|
||||
|
||||
```yaml
|
||||
auxiliary:
|
||||
# Image analysis (vision_analyze tool + browser screenshots)
|
||||
vision:
|
||||
provider: "auto" # "auto", "openrouter", "nous", "main"
|
||||
provider: "auto" # "auto", "openrouter", "nous", "codex", "main", etc.
|
||||
model: "" # e.g. "openai/gpt-4o", "google/gemini-2.5-flash"
|
||||
base_url: "" # direct OpenAI-compatible endpoint (takes precedence over provider)
|
||||
base_url: "" # Custom OpenAI-compatible endpoint (overrides provider)
|
||||
api_key: "" # API key for base_url (falls back to OPENAI_API_KEY)
|
||||
|
||||
# Web page summarization + browser page text extraction
|
||||
|
|
@@ -730,8 +785,19 @@ auxiliary:
|
|||
model: "" # e.g. "google/gemini-2.5-flash"
|
||||
base_url: ""
|
||||
api_key: ""
|
||||
|
||||
# Dangerous command approval classifier
|
||||
approval:
|
||||
provider: "auto"
|
||||
model: ""
|
||||
base_url: ""
|
||||
api_key: ""
|
||||
```
|
||||
|
||||
:::info
|
||||
Context compression has its own top-level `compression:` block with `summary_provider`, `summary_model`, and `summary_base_url` — see [Context Compression](#context-compression) above. The fallback model uses a `fallback_model:` block — see [Fallback Model](#fallback-model) above. All three follow the same provider/model/base_url pattern.
|
||||
:::
|
||||
|
||||
### Changing the Vision Model
|
||||
|
||||
To use GPT-4o instead of Gemini Flash for image analysis:
|
||||
|
|
@@ -817,18 +883,22 @@ If you use Codex OAuth as your main model provider, vision works automatically
|
|||
**Vision requires a multimodal model.** If you set `provider: "main"`, make sure your endpoint supports multimodal/vision — otherwise image analysis will fail.
|
||||
:::
|
||||
|
||||
### Environment Variables
|
||||
### Environment Variables (legacy)
|
||||
|
||||
You can also configure auxiliary models via environment variables instead of `config.yaml`:
|
||||
Auxiliary models can also be configured via environment variables. However, `config.yaml` is the preferred method — it's easier to manage and supports all options including `base_url` and `api_key`.
|
||||
|
||||
| Setting | Environment Variable |
|
||||
|---------|---------------------|
|
||||
| Vision provider | `AUXILIARY_VISION_PROVIDER` |
|
||||
| Vision model | `AUXILIARY_VISION_MODEL` |
|
||||
| Vision endpoint | `AUXILIARY_VISION_BASE_URL` |
|
||||
| Vision API key | `AUXILIARY_VISION_API_KEY` |
|
||||
| Web extract provider | `AUXILIARY_WEB_EXTRACT_PROVIDER` |
|
||||
| Web extract model | `AUXILIARY_WEB_EXTRACT_MODEL` |
|
||||
| Compression provider | `CONTEXT_COMPRESSION_PROVIDER` |
|
||||
| Compression model | `CONTEXT_COMPRESSION_MODEL` |
|
||||
| Web extract endpoint | `AUXILIARY_WEB_EXTRACT_BASE_URL` |
|
||||
| Web extract API key | `AUXILIARY_WEB_EXTRACT_API_KEY` |
|
||||
|
||||
Compression and fallback model settings are config.yaml-only.
|
||||
|
||||
:::tip
|
||||
Run `hermes config` to see your current auxiliary model settings. Overrides only show up when they differ from the defaults.
|
||||
|
|
|
|||
|
|
@@ -210,16 +210,26 @@ auxiliary:
|
|||
model: ""
|
||||
```
|
||||
|
||||
Or via environment variables:
|
||||
Every task above follows the same **provider / model / base_url** pattern. Context compression uses its own top-level block:
|
||||
|
||||
```bash
|
||||
AUXILIARY_VISION_PROVIDER=openrouter
|
||||
AUXILIARY_VISION_MODEL=openai/gpt-4o
|
||||
AUXILIARY_WEB_EXTRACT_PROVIDER=nous
|
||||
CONTEXT_COMPRESSION_PROVIDER=main
|
||||
CONTEXT_COMPRESSION_MODEL=google/gemini-3-flash-preview
|
||||
```yaml
|
||||
compression:
|
||||
summary_provider: main # Same provider options as auxiliary tasks
|
||||
summary_model: google/gemini-3-flash-preview
|
||||
summary_base_url: null # Custom OpenAI-compatible endpoint
|
||||
```
|
||||
|
||||
And the fallback model uses:
|
||||
|
||||
```yaml
|
||||
fallback_model:
|
||||
provider: openrouter
|
||||
model: anthropic/claude-sonnet-4
|
||||
# base_url: http://localhost:8000/v1 # Optional custom endpoint
|
||||
```
|
||||
|
||||
All three — auxiliary, compression, fallback — work the same way: set `provider` to pick who handles the request, `model` to pick which model, and `base_url` to point at a custom endpoint (overrides provider).
|
||||
|
||||
### Provider Options for Auxiliary Tasks
|
||||
|
||||
| Provider | Description | Requirements |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue