From 221cd60242ae9ad5bccb4aad6e91e2bc45eb3f6d Mon Sep 17 00:00:00 2001 From: s010mn Date: Wed, 20 May 2026 17:21:19 +0800 Subject: [PATCH] feat: add reasoning_effort support to ollama-cloud provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map Hermes xhigh→max to unlock DeepSeek V4's 'Max thinking' tier through Ollama Cloud's OpenAI-compatible /v1/chat/completions endpoint. low/medium/high pass through unchanged; disabled/none suppress reasoning entirely. Empirically confirmed: reasoning_effort:max produces ~2.5× more thinking tokens than high on deepseek-v4-pro:cloud (1576 vs 642). --- .../model-providers/ollama-cloud/__init__.py | 63 +++++++- .../test_ollama_cloud_profile.py | 153 ++++++++++++++++++ 2 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 tests/plugins/model_providers/test_ollama_cloud_profile.py diff --git a/plugins/model-providers/ollama-cloud/__init__.py b/plugins/model-providers/ollama-cloud/__init__.py index f25c442a401..7f04cd03ce5 100644 --- a/plugins/model-providers/ollama-cloud/__init__.py +++ b/plugins/model-providers/ollama-cloud/__init__.py @@ -1,9 +1,68 @@ -"""Ollama Cloud provider profile.""" +"""Ollama Cloud provider profile. + +Ollama Cloud's OpenAI-compatible ``/v1/chat/completions`` endpoint +supports top-level ``reasoning_effort`` with values ``none``, ``low``, +``medium``, ``high``, and ``max`` (the last being undocumented but +empirically confirmed for DeepSeek V4 — ``max`` produces ~2.5× more +thinking tokens than ``high``). + +This profile maps Hermes's ``xhigh`` → ``max`` to unlock DeepSeek V4's +"Max thinking" tier through Ollama Cloud. ``low`` / ``medium`` / ``high`` +pass through unchanged. + +When reasoning is explicitly disabled (``enabled: false`` or +``effort: "none"``), ``reasoning_effort`` is omitted entirely so the +model runs in non-thinking mode. 
class OllamaCloudProfile(ProviderProfile):
    """Ollama Cloud provider profile that emits a top-level ``reasoning_effort``.

    The Hermes effort ``xhigh`` is sent as ``max``; any other non-empty
    effort is forwarded after being stripped and lower-cased.
    """

    def build_api_kwargs_extras(
        self,
        *,
        reasoning_config: dict | None = None,
        **ctx: Any,
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Build the extra request kwargs carrying ``reasoning_effort``.

        Args:
            reasoning_config: Optional mapping that may contain an
                ``enabled`` flag and an ``effort`` string. ``None``, an
                empty dict, or a non-dict value all mean "no reasoning
                settings".
            **ctx: Additional keyword context supplied by the caller. It is
                accepted for signature compatibility and never read, so
                flags such as ``supports_reasoning`` have no effect here.

        Returns:
            A two-tuple of dicts. The first is always empty; the second is
            either empty or ``{"reasoning_effort": <value>}``.
        """
        if not isinstance(reasoning_config, dict) or not reasoning_config:
            return {}, {}

        # Only a literal ``False`` disables reasoning; a missing key counts
        # as enabled.
        if reasoning_config.get("enabled", True) is False:
            return {}, {}

        raw_effort = reasoning_config.get("effort")
        if not isinstance(raw_effort, str):
            # A missing effort, or one that is not a string (for example a
            # boolean or number coming out of a config loader), is treated
            # as "not requested" instead of crashing on ``.strip()``.
            return {}, {}

        effort = raw_effort.strip().lower()
        if not effort or effort == "none":
            # NOTE(review): the field is omitted here rather than sent as
            # "none". Whether omission really yields non-thinking output
            # depends on the endpoint's default for the model -- confirm.
            return {}, {}

        # ``xhigh`` is renamed to ``max``. Everything else -- including
        # ``max`` itself and values this profile does not recognise -- is
        # forwarded unchanged so the API can accept or reject it.
        wire_value = "max" if effort == "xhigh" else effort
        return {}, {"reasoning_effort": wire_value}
@pytest.fixture
def ollama_cloud_profile():
    """Return the Ollama Cloud profile as it is registered at runtime.

    The profile is looked up through ``providers.get_provider_profile``
    instead of being constructed directly, so the tests exercise whatever
    object is actually registered under ``"ollama-cloud"``.
    """
    # Imported for its side effect only: per the original author, importing
    # ``model_tools`` runs plugin discovery, which registers the profile.
    import model_tools  # noqa: F401
    import providers

    registered = providers.get_provider_profile("ollama-cloud")
    assert registered is not None, "ollama-cloud provider profile must be registered"
    return registered


class TestOllamaCloudReasoningEffort:
    """Wire-shape checks for ``build_api_kwargs_extras``."""

    # --- xhigh and max (any case / padding) are sent as "max" ---

    @pytest.mark.parametrize("effort", ["xhigh", "max", "MAX", " Max "])
    def test_xhigh_and_max_normalize_to_max(self, ollama_cloud_profile, effort):
        config = {"enabled": True, "effort": effort}
        body_extras, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert body_extras == {}
        assert top == {"reasoning_effort": "max"}

    # --- the standard levels are forwarded untouched ---

    @pytest.mark.parametrize("effort", ["low", "medium", "high"])
    def test_standard_efforts_pass_through(self, ollama_cloud_profile, effort):
        config = {"enabled": True, "effort": effort}
        _, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert top == {"reasoning_effort": effort}

    # --- enabled=False: nothing is emitted ---

    def test_explicitly_disabled_emits_nothing(self, ollama_cloud_profile):
        body_extras, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config={"enabled": False},
        )
        assert body_extras == {}
        assert top == {}

    def test_disabled_ignores_effort_field(self, ollama_cloud_profile):
        """A supplied effort is dropped when reasoning is switched off."""
        config = {"enabled": False, "effort": "high"}
        _, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert top == {}

    # --- effort "none": nothing is emitted ---

    def test_none_effort_emits_nothing(self, ollama_cloud_profile):
        config = {"enabled": True, "effort": "none"}
        body_extras, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert body_extras == {}
        assert top == {}

    # --- no config / no effort: the field is left out ---

    def test_no_reasoning_config_emits_nothing(self, ollama_cloud_profile):
        body_extras, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=None,
        )
        assert body_extras == {}
        assert top == {}

    def test_empty_effort_emits_nothing(self, ollama_cloud_profile):
        config = {"enabled": True, "effort": ""}
        _, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert top == {}

    def test_no_effort_key_emits_nothing(self, ollama_cloud_profile):
        """With no ``effort`` key at all, no ``reasoning_effort`` is sent."""
        _, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config={"enabled": True},
        )
        assert top == {}

    # --- unrecognised effort values are forwarded verbatim ---

    def test_unknown_effort_forwarded(self, ollama_cloud_profile):
        config = {"enabled": True, "effort": "ultra"}
        _, top = ollama_cloud_profile.build_api_kwargs_extras(
            reasoning_config=config,
        )
        assert top == {"reasoning_effort": "ultra"}


class TestOllamaCloudFullKwargsIntegration:
    """Checks the kwargs produced by ``ChatCompletionsTransport.build_kwargs``."""

    def _build(self, profile, reasoning_config):
        """Run the transport's ``build_kwargs`` with the shared test request."""
        from agent.transports.chat_completions import ChatCompletionsTransport

        return ChatCompletionsTransport().build_kwargs(
            model="deepseek-v4-pro:cloud",
            messages=[{"role": "user", "content": "ping"}],
            tools=None,
            provider_profile=profile,
            reasoning_config=reasoning_config,
            base_url="https://ollama.com/v1",
            provider_name="ollama-cloud",
        )

    def test_full_kwargs_with_xhigh(self, ollama_cloud_profile):
        kwargs = self._build(
            ollama_cloud_profile,
            {"enabled": True, "effort": "xhigh"},
        )
        assert kwargs["model"] == "deepseek-v4-pro:cloud"
        assert kwargs["reasoning_effort"] == "max"
        # The effort must travel as a top-level kwarg, not as a
        # ``reasoning`` entry inside ``extra_body``.
        assert not (
            "extra_body" in kwargs
            and "reasoning" in kwargs.get("extra_body", {})
        )

    def test_full_kwargs_with_disabled(self, ollama_cloud_profile):
        kwargs = self._build(ollama_cloud_profile, {"enabled": False})
        assert "reasoning_effort" not in kwargs


class TestOllamaCloudAuxModel:
    """The profile exposes the expected default auxiliary model."""

    def test_profile_advertises_aux_model(self, ollama_cloud_profile):
        expected = "nemotron-3-nano:30b"
        assert ollama_cloud_profile.default_aux_model == expected