From 00a8252b7d33dc56fe33541b2bf04f197d38258a Mon Sep 17 00:00:00 2001
From: YuShu <24110240104@m.fudan.edu.cn>
Date: Tue, 21 Apr 2026 00:56:28 +0800
Subject: [PATCH] fix(agent): scope Ollama/GLM stop-to-length heuristic to
 Ollama only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The _is_ollama_glm_backend() function was too broad: any local endpoint
running a GLM model (or used with the "zai" provider) was treated as
Ollama, triggering the stop->length misreport heuristic introduced in
8011aa3. This caused false truncation detection on sglang, vLLM,
LM Studio, and other non-Ollama servers that correctly report
finish_reason.

When a GLM model on sglang/vLLM returned finish_reason='stop', the agent
mistakenly reclassified it as 'length' if the response didn't end with
a whitelisted punctuation character (ASCII or CJK). This particularly
affected Chinese-language responses and Markdown-formatted text.

Root cause: the is_local_endpoint() fallback treated every local GLM
endpoint as Ollama, but many non-Ollama servers also run on localhost.

Fix: remove the is_local_endpoint() catch-all. Ollama is now detected
only via its distinctive signatures in the base URL (":11434" or
"ollama", matched case-insensitively). All other local servers are
assumed to report finish_reason correctly.

This is the correct tradeoff because:
- False negatives (Ollama at custom port, heuristic not triggered) only
  mean the user sees a truncated response — same as having no heuristic
- False positives (non-Ollama server, heuristic wrongly triggered) inject
  spurious continuation messages into the conversation — strictly worse

Adds two tests:
- sglang GLM response is NOT reclassified as truncated
- Ollama GLM on port 11434 still triggers the heuristic as before

Co-authored-by: Hermes Agent <hermes@nousresearch.com>
---
 run_agent.py                      | 20 ++++++--
 tests/run_agent/test_run_agent.py | 77 +++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index 8026b024e71..674758d3315 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1367,14 +1367,26 @@ class AIAgent:
         return False
 
     def _is_ollama_glm_backend(self) -> bool:
-        """Detect the narrow backend family affected by Ollama/GLM stop misreports."""
+        """Detect the narrow backend family affected by Ollama/GLM stop misreports.
+
+        Returns True only for GLM models (or provider "zai") served by a
+        backend recognizably Ollama, which can misreport truncated output as
+        finish_reason='stop'.  Other local servers (sglang, vLLM, LM Studio,
+        etc.) report finish_reason correctly and must NOT get the heuristic.
+
+        Detection relies on explicit Ollama signatures:
+        - Port 11434 (Ollama default)
+        - "ollama" in the base URL (e.g. ollama.local, /ollama/ path)
+
+        The previous is_local_endpoint() fallback was too broad and caused
+        false truncation detection on non-Ollama local servers hosting GLM
+        models (sglang, vLLM, etc.).
+        """
         model_lower = (self.model or "").lower()
         provider_lower = (self.provider or "").lower()
         if "glm" not in model_lower and provider_lower != "zai":
             return False
-        if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
-            return True
-        return bool(self.base_url and is_local_endpoint(self.base_url))
+        return "ollama" in self._base_url_lower or ":11434" in self._base_url_lower
 
     def _should_treat_stop_as_truncated(
         self,
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 1fce5ba164b..ca6f9ca7c99 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -4526,6 +4526,83 @@ class TestRunConversation:
         assert result["api_calls"] == 2
         assert result["final_response"] == "Based on the search results, the best next"
 
+    def test_sglang_glm_stop_without_terminal_boundary_does_not_continue(self, agent):
+        """sglang/vLLM-hosted GLM models report finish_reason correctly.
+
+        The stop->length workaround must NOT apply to non-Ollama local
+        servers that expose OpenAI-compatible /v1 endpoints (sglang, vLLM,
+        LM Studio, etc.).  A Chinese-text response that ends without terminal
+        punctuation (ASCII or CJK) should not be reclassified as truncated.
+        """
+        self._setup_agent(agent)
+        agent.base_url = "http://127.0.0.1:60000/v1"
+        agent._base_url_lower = agent.base_url.lower()
+        agent.model = "glm-5-fp8"
+
+        tool_turn = _mock_response(
+            content="",
+            finish_reason="tool_calls",
+            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
+        )
+        # Ends on a CJK character with no terminal punctuation — NOT truncated
+        normal_stop = _mock_response(
+            content="根据搜索结果，建议修改配置",
+            finish_reason="stop",
+        )
+        agent.client.chat.completions.create.side_effect = [tool_turn, normal_stop]
+
+        with (
+            patch("run_agent.handle_function_call", return_value="search result"),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["api_calls"] == 2
+        assert result["final_response"] == "根据搜索结果，建议修改配置"
+
+    def test_ollama_glm_on_port_11434_still_triggers_heuristic(self, agent):
+        """Ollama on port 11434 should still trigger the stop->length heuristic."""
+        self._setup_agent(agent)
+        agent.base_url = "http://localhost:11434/v1"
+        agent._base_url_lower = agent.base_url.lower()
+        agent.model = "glm-5.1:cloud"
+
+        tool_turn = _mock_response(
+            content="",
+            finish_reason="tool_calls",
+            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
+        )
+        misreported_stop = _mock_response(
+            content="Based on the search results, the best next",
+            finish_reason="stop",
+        )
+        continued = _mock_response(
+            content=" step is to update the config.",
+            finish_reason="stop",
+        )
+        agent.client.chat.completions.create.side_effect = [
+            tool_turn,
+            misreported_stop,
+            continued,
+        ]
+
+        with (
+            patch("run_agent.handle_function_call", return_value="search result"),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["api_calls"] == 3
+        third_call_messages = agent.client.chat.completions.create.call_args_list[2].kwargs["messages"]
+        assert "truncated by the output length limit" in third_call_messages[-1]["content"]
+
+
     def test_length_thinking_exhausted_skips_continuation(self, agent):
         """When finish_reason='length' but content is only thinking, skip retries."""
         self._setup_agent(agent)