fix(kimi): omit temperature entirely for Kimi/Moonshot models (#13157)

Kimi's gateway selects the correct temperature server-side based on the
active mode (thinking -> 1.0, non-thinking -> 0.6).  Sending any
temperature value — even the previously "correct" one — conflicts with
gateway-managed defaults.

Replaces the old approach of forcing specific temperature values (0.6
for non-thinking, 1.0 for thinking) with an OMIT_TEMPERATURE sentinel
that tells all call sites to strip the temperature key from API kwargs
entirely.

Changes:
- agent/auxiliary_client.py: OMIT_TEMPERATURE sentinel, _is_kimi_model()
  prefix check (covers all kimi-* models), _fixed_temperature_for_model()
  returns sentinel for kimi models.  _build_call_kwargs() strips temp.
- run_agent.py: _build_api_kwargs, flush_memories, and summary generation
  paths all handle the sentinel by popping/omitting temperature.
- trajectory_compressor.py: _effective_temperature_for_model returns None
  for kimi (sentinel mapped), direct client calls use kwargs dict to
  conditionally include temperature.
- mini_swe_runner.py: same sentinel handling via wrapper function.
- 6 test files updated: all 'forces temperature X' assertions replaced
  with 'temperature not in kwargs' assertions.

Net: -76 lines (171 added, 247 removed).
Inspired by PR #13137 (@kshitijk4poor).
This commit is contained in:
Teknium 2026-04-20 12:23:05 -07:00 committed by GitHub
parent c1977146ce
commit 3cba81ebed
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 170 additions and 246 deletions

View file

@ -95,85 +95,37 @@ def _normalize_aux_provider(provider: Optional[str]) -> str:
return _PROVIDER_ALIASES.get(normalized, normalized)
_FIXED_TEMPERATURE_MODELS: Dict[str, float] = {
"kimi-for-coding": 0.6,
}
# Sentinel: when returned by _fixed_temperature_for_model(), callers must
# strip the ``temperature`` key from API kwargs entirely so the provider's
# server-side default applies. Kimi/Moonshot models manage temperature
# internally — sending *any* value (even the "correct" one) can conflict
# with gateway-side mode selection (thinking → 1.0, non-thinking → 0.6).
OMIT_TEMPERATURE: object = object()
# Moonshot's kimi-for-coding endpoint (api.kimi.com/coding) documents:
# "k2.5 model will use a fixed value 1.0, non-thinking mode will use a fixed
# value 0.6. Any other value will result in an error." The same lock applies
# to the other k2.* models served on that endpoint. Enumerated explicitly so
# non-coding siblings like `kimi-k2-instruct` (variable temperature, served on
# the standard chat API and third parties) are NOT clamped.
# Source: https://platform.kimi.ai/docs/guide/kimi-k2-5-quickstart
_KIMI_INSTANT_MODELS: frozenset = frozenset({
"kimi-k2.6",
"kimi-k2.5",
"kimi-k2-turbo-preview",
"kimi-k2-0905-preview",
})
_KIMI_THINKING_MODELS: frozenset = frozenset({
"kimi-k2-thinking",
"kimi-k2-thinking-turbo",
})
# Moonshot's public chat endpoint (api.moonshot.ai/v1) enforces a different
# temperature contract than the Coding Plan endpoint above. Empirically,
# `kimi-k2.5` on the public API rejects 0.6 with HTTP 400
# "invalid temperature: only 1 is allowed for this model" — the Coding Plan
# lock (0.6 for non-thinking) does not apply. `kimi-k2-turbo-preview` and the
# thinking variants already match the Coding Plan contract on the public
# endpoint, so we only override the models that diverge.
# Users hit this endpoint when `KIMI_API_KEY` is a legacy `sk-*` key (the
# `sk-kimi-*` prefix routes to api.kimi.com/coding/v1 instead — see
# hermes_cli/auth.py:_kimi_base_url_for_key).
_KIMI_PUBLIC_API_OVERRIDES: Dict[str, float] = {
"kimi-k2.5": 1.0,
}
def _is_kimi_model(model: Optional[str]) -> bool:
"""True for any Kimi / Moonshot model that manages temperature server-side."""
bare = (model or "").strip().lower().rsplit("/", 1)[-1]
return bare.startswith("kimi-") or bare == "kimi"
def _fixed_temperature_for_model(
model: Optional[str],
base_url: Optional[str] = None,
) -> Optional[float]:
"""Return a required temperature override for models with strict contracts.
) -> "Optional[float] | object":
"""Return a temperature directive for models with strict contracts.
Moonshot's kimi-for-coding endpoint rejects any non-approved temperature on
the k2.5 family. Non-thinking variants require exactly 0.6; thinking
variants require 1.0. An optional ``vendor/`` prefix (e.g.
``moonshotai/kimi-k2.5``) is tolerated for aggregator routings.
When ``base_url`` points to Moonshot's public chat endpoint
(``api.moonshot.ai``), the contract changes for ``kimi-k2.5``: the public
API only accepts ``temperature=1``, not 0.6. That override takes precedence
over the Coding Plan defaults above.
Returns ``None`` for every other model, including ``kimi-k2-instruct*``
which is the separate non-coding K2 family with variable temperature.
Returns:
``OMIT_TEMPERATURE`` — caller must remove the ``temperature`` key so the
provider chooses its own default. Used for all Kimi / Moonshot
models whose gateway selects temperature server-side.
``float`` — a specific value the caller must use (reserved for future
models with fixed-temperature contracts).
``None`` — no override; caller should use its own default.
"""
normalized = (model or "").strip().lower()
bare = normalized.rsplit("/", 1)[-1]
# Public Moonshot API has a stricter contract for some models than the
# Coding Plan endpoint — check it first so it wins on conflict.
if base_url and ("api.moonshot.ai" in base_url.lower() or "api.moonshot.cn" in base_url.lower()):
public = _KIMI_PUBLIC_API_OVERRIDES.get(bare)
if public is not None:
logger.debug(
"Forcing temperature=%s for %r on public Moonshot API", public, model
)
return public
fixed = _FIXED_TEMPERATURE_MODELS.get(normalized)
if fixed is not None:
logger.debug("Forcing temperature=%s for model %r (fixed map)", fixed, model)
return fixed
if bare in _KIMI_THINKING_MODELS:
logger.debug("Forcing temperature=1.0 for kimi thinking model %r", model)
return 1.0
if bare in _KIMI_INSTANT_MODELS:
logger.debug("Forcing temperature=0.6 for kimi instant model %r", model)
return 0.6
if _is_kimi_model(model):
logger.debug("Omitting temperature for Kimi model %r (server-managed)", model)
return OMIT_TEMPERATURE
return None
# Default auxiliary models for direct API-key providers (cheap/fast for side tasks)
@ -2476,7 +2428,9 @@ def _build_call_kwargs(
}
fixed_temperature = _fixed_temperature_for_model(model, base_url)
if fixed_temperature is not None:
if fixed_temperature is OMIT_TEMPERATURE:
temperature = None # strip — let server choose
elif fixed_temperature is not None:
temperature = fixed_temperature
# Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently

View file

@ -47,12 +47,19 @@ def _effective_temperature_for_model(
model: str,
base_url: Optional[str] = None,
) -> Optional[float]:
"""Return a fixed temperature for models with strict sampling contracts."""
"""Return a fixed temperature for models with strict sampling contracts.
Returns ``None`` when the model manages temperature server-side (Kimi);
callers must omit the ``temperature`` kwarg entirely in that case.
"""
try:
from agent.auxiliary_client import _fixed_temperature_for_model
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
except Exception:
return None
return _fixed_temperature_for_model(model, base_url)
result = _fixed_temperature_for_model(model, base_url)
if result is OMIT_TEMPERATURE:
return None # caller must omit temperature
return result

View file

@ -6855,12 +6855,15 @@ class AIAgent:
"timeout": self._resolved_api_call_timeout(),
}
try:
from agent.auxiliary_client import _fixed_temperature_for_model
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
except Exception:
_fixed_temperature_for_model = None
OMIT_TEMPERATURE = None
if _fixed_temperature_for_model is not None:
fixed_temperature = _fixed_temperature_for_model(self.model, self.base_url)
if fixed_temperature is not None:
if fixed_temperature is OMIT_TEMPERATURE:
api_kwargs.pop("temperature", None)
elif fixed_temperature is not None:
api_kwargs["temperature"] = fixed_temperature
if self._is_qwen_portal():
api_kwargs["metadata"] = {
@ -7301,12 +7304,19 @@ class AIAgent:
from agent.auxiliary_client import (
call_llm as _call_llm,
_fixed_temperature_for_model,
OMIT_TEMPERATURE,
)
_aux_available = True
# Use the fixed-temperature override (e.g. kimi-for-coding → 0.6) if
# the model has a strict contract; otherwise the historical 0.3 default.
_flush_temperature = _fixed_temperature_for_model(self.model, self.base_url)
if _flush_temperature is None:
# Kimi models manage temperature server-side — omit it entirely.
# Other models with a fixed contract get that value; everyone else
# gets the historical 0.3 default.
_fixed_temp = _fixed_temperature_for_model(self.model, self.base_url)
_omit_temperature = _fixed_temp is OMIT_TEMPERATURE
if _omit_temperature:
_flush_temperature = None
elif _fixed_temp is not None:
_flush_temperature = _fixed_temp
else:
_flush_temperature = 0.3
try:
response = _call_llm(
@ -7325,7 +7335,10 @@ class AIAgent:
# No auxiliary client -- use the Codex Responses path directly
codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
if _flush_temperature is not None:
codex_kwargs["temperature"] = _flush_temperature
else:
codex_kwargs.pop("temperature", None)
if "max_output_tokens" in codex_kwargs:
codex_kwargs["max_output_tokens"] = 5120
response = self._run_codex_stream(codex_kwargs)
@ -7344,9 +7357,10 @@ class AIAgent:
"model": self.model,
"messages": api_messages,
"tools": [memory_tool_def],
"temperature": _flush_temperature,
**self._max_tokens_param(5120),
}
if _flush_temperature is not None:
api_kwargs["temperature"] = _flush_temperature
from agent.auxiliary_client import _get_task_timeout
response = self._ensure_primary_openai_client(reason="flush_memories").chat.completions.create(
**api_kwargs, timeout=_get_task_timeout("flush_memories")
@ -8368,14 +8382,17 @@ class AIAgent:
summary_extra_body = {}
try:
from agent.auxiliary_client import _fixed_temperature_for_model
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE as _OMIT_TEMP
except Exception:
_fixed_temperature_for_model = None
_summary_temperature = (
_OMIT_TEMP = None
_raw_summary_temp = (
_fixed_temperature_for_model(self.model, self.base_url)
if _fixed_temperature_for_model is not None
else None
)
_omit_summary_temperature = _raw_summary_temp is _OMIT_TEMP
_summary_temperature = None if _omit_summary_temperature else _raw_summary_temp
_is_nous = "nousresearch" in self._base_url_lower
if self._supports_reasoning_extra_body():
if self.reasoning_config is not None:

View file

@ -696,27 +696,46 @@ class TestIsConnectionError:
assert _is_connection_error(err) is False
class TestKimiForCodingTemperature:
"""Moonshot kimi-for-coding models require fixed temperatures.
class TestKimiTemperatureOmitted:
"""Kimi/Moonshot models should have temperature OMITTED from API kwargs.
k2.5 / k2-turbo-preview / k2-0905-preview → 0.6 (non-thinking lock).
k2-thinking / k2-thinking-turbo → 1.0 (thinking lock).
kimi-k2-instruct* and every other model preserve the caller's temperature.
The Kimi gateway selects the correct temperature server-side based on the
active mode (thinking → 1.0, non-thinking → 0.6). Sending any temperature
value conflicts with gateway-managed defaults.
"""
def test_build_call_kwargs_forces_fixed_temperature(self):
@pytest.mark.parametrize(
"model",
[
"kimi-for-coding",
"kimi-k2.5",
"kimi-k2.6",
"kimi-k2-turbo-preview",
"kimi-k2-0905-preview",
"kimi-k2-thinking",
"kimi-k2-thinking-turbo",
"kimi-k2-instruct",
"kimi-k2-instruct-0905",
"moonshotai/kimi-k2.5",
"moonshotai/Kimi-K2-Thinking",
"moonshotai/Kimi-K2-Instruct",
],
)
def test_kimi_models_omit_temperature(self, model):
"""No kimi model should have a temperature key in kwargs."""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
provider="kimi-coding",
model="kimi-for-coding",
model=model,
messages=[{"role": "user", "content": "hello"}],
temperature=0.3,
)
assert kwargs["temperature"] == 0.6
assert "temperature" not in kwargs
def test_build_call_kwargs_injects_temperature_when_missing(self):
def test_kimi_for_coding_no_temperature_when_none(self):
"""When caller passes temperature=None, still no temperature key."""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
@ -726,9 +745,9 @@ class TestKimiForCodingTemperature:
temperature=None,
)
assert kwargs["temperature"] == 0.6
assert "temperature" not in kwargs
def test_auto_routed_kimi_for_coding_sync_call_uses_fixed_temperature(self):
def test_sync_call_omits_temperature(self):
client = MagicMock()
client.base_url = "https://api.kimi.com/coding/v1"
response = MagicMock()
@ -750,10 +769,10 @@ class TestKimiForCodingTemperature:
assert result is response
kwargs = client.chat.completions.create.call_args.kwargs
assert kwargs["model"] == "kimi-for-coding"
assert kwargs["temperature"] == 0.6
assert "temperature" not in kwargs
@pytest.mark.asyncio
async def test_auto_routed_kimi_for_coding_async_call_uses_fixed_temperature(self):
async def test_async_call_omits_temperature(self):
client = MagicMock()
client.base_url = "https://api.kimi.com/coding/v1"
response = MagicMock()
@ -775,52 +794,17 @@ class TestKimiForCodingTemperature:
assert result is response
kwargs = client.chat.completions.create.call_args.kwargs
assert kwargs["model"] == "kimi-for-coding"
assert kwargs["temperature"] == 0.6
@pytest.mark.parametrize(
"model,expected",
[
("kimi-k2.5", 0.6),
("kimi-k2-turbo-preview", 0.6),
("kimi-k2-0905-preview", 0.6),
("kimi-k2-thinking", 1.0),
("kimi-k2-thinking-turbo", 1.0),
("moonshotai/kimi-k2.5", 0.6),
("moonshotai/Kimi-K2-Thinking", 1.0),
],
)
def test_kimi_k2_family_temperature_override(self, model, expected):
"""Moonshot kimi-k2.* models only accept fixed temperatures.
Non-thinking models → 0.6, thinking-mode models → 1.0.
"""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
provider="kimi-coding",
model=model,
messages=[{"role": "user", "content": "hello"}],
temperature=0.3,
)
assert kwargs["temperature"] == expected
assert "temperature" not in kwargs
@pytest.mark.parametrize(
"model",
[
"anthropic/claude-sonnet-4-6",
"gpt-5.4",
# kimi-k2-instruct is the non-coding K2 family — temperature is
# variable (recommended 0.6 but not enforced). Must not clamp.
"kimi-k2-instruct",
"moonshotai/Kimi-K2-Instruct",
"moonshotai/Kimi-K2-Instruct-0905",
"kimi-k2-instruct-0905",
# Hypothetical future kimi name not in the whitelist.
"kimi-k2-experimental",
"deepseek-chat",
],
)
def test_non_restricted_model_preserves_temperature(self, model):
def test_non_kimi_models_preserve_temperature(self, model):
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
@ -832,25 +816,16 @@ class TestKimiForCodingTemperature:
assert kwargs["temperature"] == 0.3
# ── Endpoint-aware overrides: api.moonshot.ai vs api.kimi.com/coding ──
# The public Moonshot chat endpoint and the Coding Plan endpoint enforce
# different temperature contracts for the same model name. `kimi-k2.5` on
# api.moonshot.ai rejects 0.6 with HTTP 400 "only 1 is allowed for this
# model", while the Coding Plan docs mandate 0.6. Override must pick the
# right value per base_url.
@pytest.mark.parametrize(
"base_url",
[
"https://api.moonshot.ai/v1",
"https://api.moonshot.ai/v1/",
"https://API.MOONSHOT.AI/v1",
"https://api.moonshot.cn/v1",
"https://api.moonshot.cn/v1/",
"https://api.kimi.com/coding/v1",
],
)
def test_kimi_k2_5_public_api_forces_temperature_1(self, base_url):
"""kimi-k2.5 on the public Moonshot API only accepts temperature=1."""
def test_kimi_k2_5_omits_temperature_regardless_of_endpoint(self, base_url):
"""Temperature is omitted regardless of which Kimi endpoint is used."""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
@ -861,64 +836,7 @@ class TestKimiForCodingTemperature:
base_url=base_url,
)
assert kwargs["temperature"] == 1.0
def test_kimi_k2_5_coding_plan_keeps_temperature_0_6(self):
"""kimi-k2.5 on api.kimi.com/coding keeps the Coding Plan's 0.6 lock."""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
provider="kimi-coding",
model="kimi-k2.5",
messages=[{"role": "user", "content": "hello"}],
temperature=0.1,
base_url="https://api.kimi.com/coding/v1",
)
assert kwargs["temperature"] == 0.6
def test_kimi_k2_5_no_base_url_falls_back_to_coding_plan_lock(self):
"""Without a base_url hint, the Coding Plan default (0.6) applies.
Preserves PR #12144 backward compatibility for callers that don't thread
the client's base_url through.
"""
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
provider="kimi-coding",
model="kimi-k2.5",
messages=[{"role": "user", "content": "hello"}],
temperature=0.1,
)
assert kwargs["temperature"] == 0.6
@pytest.mark.parametrize(
"model,expected",
[
# Only kimi-k2.5 diverges on api.moonshot.ai; the rest keep the
# Coding Plan lock (empirically verified against Moonshot in April
# 2026: turbo-preview accepts 0.6, thinking-turbo accepts 1.0).
("kimi-k2-turbo-preview", 0.6),
("kimi-k2-0905-preview", 0.6),
("kimi-k2-thinking", 1.0),
("kimi-k2-thinking-turbo", 1.0),
("moonshotai/kimi-k2-thinking-turbo", 1.0),
],
)
def test_other_kimi_k2_family_unchanged_on_public_api(self, model, expected):
from agent.auxiliary_client import _build_call_kwargs
kwargs = _build_call_kwargs(
provider="kimi-coding",
model=model,
messages=[{"role": "user", "content": "hello"}],
temperature=0.1,
base_url="https://api.moonshot.ai/v1",
)
assert kwargs["temperature"] == expected
assert "temperature" not in kwargs
# ---------------------------------------------------------------------------

View file

@ -251,8 +251,12 @@ class TestBuildApiKwargsChatCompletionsServiceTier:
assert "service_tier" not in kwargs
class TestBuildApiKwargsKimiFixedTemperature:
def test_kimi_for_coding_forces_temperature_on_main_chat_path(self, monkeypatch):
class TestBuildApiKwargsKimiNoTemperatureOverride:
def test_kimi_for_coding_omits_temperature(self, monkeypatch):
"""Temperature should NOT be set client-side for Kimi models.
The Kimi gateway selects the correct temperature server-side.
"""
agent = _make_agent(
monkeypatch,
"kimi-coding",
@ -261,7 +265,7 @@ class TestBuildApiKwargsKimiFixedTemperature:
)
messages = [{"role": "user", "content": "hi"}]
kwargs = agent._build_api_kwargs(messages)
assert kwargs["temperature"] == 0.6
assert "temperature" not in kwargs
class TestBuildApiKwargsAIGateway:

View file

@ -918,7 +918,11 @@ class TestBuildApiKwargs:
assert kwargs["messages"] is messages
assert kwargs["timeout"] == 1800.0
def test_public_moonshot_kimi_k2_5_forces_temperature_1(self, agent):
def test_public_moonshot_kimi_k2_5_omits_temperature(self, agent):
"""Kimi models should NOT have client-side temperature overrides.
The Kimi gateway selects the correct temperature server-side.
"""
agent.base_url = "https://api.moonshot.ai/v1"
agent._base_url_lower = agent.base_url.lower()
agent.model = "kimi-k2.5"
@ -926,9 +930,9 @@ class TestBuildApiKwargs:
kwargs = agent._build_api_kwargs(messages)
assert kwargs["temperature"] == 1.0
assert "temperature" not in kwargs
def test_public_moonshot_cn_kimi_k2_5_forces_temperature_1(self, agent):
def test_public_moonshot_cn_kimi_k2_5_omits_temperature(self, agent):
agent.base_url = "https://api.moonshot.cn/v1"
agent._base_url_lower = agent.base_url.lower()
agent.model = "kimi-k2.5"
@ -936,9 +940,9 @@ class TestBuildApiKwargs:
kwargs = agent._build_api_kwargs(messages)
assert kwargs["temperature"] == 1.0
assert "temperature" not in kwargs
def test_kimi_coding_endpoint_keeps_kimi_k2_5_at_0_6(self, agent):
def test_kimi_coding_endpoint_omits_temperature(self, agent):
agent.base_url = "https://api.kimi.com/coding/v1"
agent._base_url_lower = agent.base_url.lower()
agent.model = "kimi-k2.5"
@ -946,7 +950,7 @@ class TestBuildApiKwargs:
kwargs = agent._build_api_kwargs(messages)
assert kwargs["temperature"] == 0.6
assert "temperature" not in kwargs
def test_provider_preferences_injected(self, agent):
agent.base_url = "https://openrouter.ai/api/v1"

View file

@ -2,7 +2,11 @@ from types import SimpleNamespace
from unittest.mock import MagicMock, patch
def test_run_task_forces_kimi_fixed_temperature():
def test_run_task_kimi_omits_temperature():
"""Kimi models should NOT have client-side temperature overrides.
The Kimi gateway selects the correct temperature server-side.
"""
with patch("openai.OpenAI") as mock_openai:
client = MagicMock()
client.chat.completions.create.return_value = SimpleNamespace(
@ -25,10 +29,11 @@ def test_run_task_forces_kimi_fixed_temperature():
result = runner.run_task("2+2")
assert result["completed"] is True
assert client.chat.completions.create.call_args.kwargs["temperature"] == 0.6
assert "temperature" not in client.chat.completions.create.call_args.kwargs
def test_run_task_public_moonshot_kimi_k2_5_forces_temperature_1():
def test_run_task_public_moonshot_kimi_k2_5_omits_temperature():
"""kimi-k2.5 on the public Moonshot API should not get a forced temperature."""
with patch("openai.OpenAI") as mock_openai:
client = MagicMock()
client.base_url = "https://api.moonshot.ai/v1"
@ -52,4 +57,4 @@ def test_run_task_public_moonshot_kimi_k2_5_forces_temperature_1():
result = runner.run_task("2+2")
assert result["completed"] is True
assert client.chat.completions.create.call_args.kwargs["temperature"] == 1.0
assert "temperature" not in client.chat.completions.create.call_args.kwargs

View file

@ -31,7 +31,8 @@ def test_import_loads_env_from_hermes_home(tmp_path, monkeypatch):
assert os.getenv("OPENROUTER_API_KEY") == "from-hermes-home"
def test_generate_summary_custom_client_forces_kimi_temperature():
def test_generate_summary_kimi_omits_temperature():
"""Kimi models should have temperature omitted — server manages it."""
config = CompressionConfig(
summarization_model="kimi-for-coding",
temperature=0.3,
@ -51,10 +52,11 @@ def test_generate_summary_custom_client_forces_kimi_temperature():
result = compressor._generate_summary("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert compressor.client.chat.completions.create.call_args.kwargs["temperature"] == 0.6
assert "temperature" not in compressor.client.chat.completions.create.call_args.kwargs
def test_generate_summary_public_moonshot_kimi_k2_5_forces_temperature_1():
def test_generate_summary_public_moonshot_kimi_k2_5_omits_temperature():
"""kimi-k2.5 on the public Moonshot API should not get a forced temperature."""
config = CompressionConfig(
summarization_model="kimi-k2.5",
base_url="https://api.moonshot.ai/v1",
@ -75,10 +77,11 @@ def test_generate_summary_public_moonshot_kimi_k2_5_forces_temperature_1():
result = compressor._generate_summary("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert compressor.client.chat.completions.create.call_args.kwargs["temperature"] == 1.0
assert "temperature" not in compressor.client.chat.completions.create.call_args.kwargs
def test_generate_summary_public_moonshot_cn_kimi_k2_5_forces_temperature_1():
def test_generate_summary_public_moonshot_cn_kimi_k2_5_omits_temperature():
"""kimi-k2.5 on api.moonshot.cn should not get a forced temperature."""
config = CompressionConfig(
summarization_model="kimi-k2.5",
base_url="https://api.moonshot.cn/v1",
@ -99,7 +102,7 @@ def test_generate_summary_public_moonshot_cn_kimi_k2_5_forces_temperature_1():
result = compressor._generate_summary("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert compressor.client.chat.completions.create.call_args.kwargs["temperature"] == 1.0
assert "temperature" not in compressor.client.chat.completions.create.call_args.kwargs
# ---------------------------------------------------------------------------

View file

@ -117,7 +117,8 @@ class TestSourceLineVerification:
@pytest.mark.asyncio
async def test_generate_summary_async_custom_client_forces_kimi_temperature():
async def test_generate_summary_async_kimi_omits_temperature():
"""Kimi models should have temperature omitted — server manages it."""
from trajectory_compressor import CompressionConfig, TrajectoryCompressor, TrajectoryMetrics
config = CompressionConfig(
@ -140,11 +141,12 @@ async def test_generate_summary_async_custom_client_forces_kimi_temperature():
result = await compressor._generate_summary_async("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert async_client.chat.completions.create.call_args.kwargs["temperature"] == 0.6
assert "temperature" not in async_client.chat.completions.create.call_args.kwargs
@pytest.mark.asyncio
async def test_generate_summary_async_public_moonshot_kimi_k2_5_forces_temperature_1():
async def test_generate_summary_async_public_moonshot_kimi_k2_5_omits_temperature():
"""kimi-k2.5 on the public Moonshot API should not get a forced temperature."""
from trajectory_compressor import CompressionConfig, TrajectoryCompressor, TrajectoryMetrics
config = CompressionConfig(
@ -168,12 +170,12 @@ async def test_generate_summary_async_public_moonshot_kimi_k2_5_forces_temperatu
result = await compressor._generate_summary_async("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert async_client.chat.completions.create.call_args.kwargs["temperature"] == 1.0
assert "temperature" not in async_client.chat.completions.create.call_args.kwargs
@pytest.mark.asyncio
async def test_generate_summary_async_public_moonshot_cn_kimi_k2_5_forces_temperature_1():
async def test_generate_summary_async_public_moonshot_cn_kimi_k2_5_omits_temperature():
"""kimi-k2.5 on api.moonshot.cn should not get a forced temperature."""
from trajectory_compressor import CompressionConfig, TrajectoryCompressor, TrajectoryMetrics
config = CompressionConfig(
@ -197,4 +199,4 @@ async def test_generate_summary_async_public_moonshot_cn_kimi_k2_5_forces_temper
result = await compressor._generate_summary_async("tool output", metrics)
assert result.startswith("[CONTEXT SUMMARY]:")
assert async_client.chat.completions.create.call_args.kwargs["temperature"] == 1.0
assert "temperature" not in async_client.chat.completions.create.call_args.kwargs

View file

@ -58,14 +58,20 @@ def _effective_temperature_for_model(
model: str,
requested_temperature: float,
base_url: Optional[str] = None,
) -> float:
"""Apply fixed model temperature contracts to direct client calls."""
) -> Optional[float]:
"""Apply fixed model temperature contracts to direct client calls.
Returns ``None`` when the model manages temperature server-side (Kimi);
callers must omit the ``temperature`` kwarg entirely in that case.
"""
try:
from agent.auxiliary_client import _fixed_temperature_for_model
from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
except Exception:
return requested_temperature
fixed_temperature = _fixed_temperature_for_model(model, base_url)
if fixed_temperature is OMIT_TEMPERATURE:
return None # caller must omit temperature
if fixed_temperature is not None:
return fixed_temperature
return requested_temperature
@ -600,12 +606,14 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = self.client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=summary_temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
_create_kwargs = {
"model": self.config.summarization_model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": self.config.summary_target_tokens * 2,
}
if summary_temperature is not None:
_create_kwargs["temperature"] = summary_temperature
response = self.client.chat.completions.create(**_create_kwargs)
summary = self._coerce_summary_content(response.choices[0].message.content)
return self._ensure_summary_prefix(summary)
@ -667,12 +675,14 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = await self._get_async_client().chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=summary_temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
_create_kwargs = {
"model": self.config.summarization_model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": self.config.summary_target_tokens * 2,
}
if summary_temperature is not None:
_create_kwargs["temperature"] = summary_temperature
response = await self._get_async_client().chat.completions.create(**_create_kwargs)
summary = self._coerce_summary_content(response.choices[0].message.content)
return self._ensure_summary_prefix(summary)