From 00a8252b7d33dc56fe33541b2bf04f197d38258a Mon Sep 17 00:00:00 2001
From: YuShu <24110240104@m.fudan.edu.cn>
Date: Tue, 21 Apr 2026 00:56:28 +0800
Subject: [PATCH] fix(agent): scope Ollama/GLM stop-to-length heuristic to Ollama only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The _is_ollama_glm_backend() function was too broad: any local endpoint
running a GLM model was treated as Ollama, triggering the stop->length
misreport heuristic introduced in 8011aa3. This caused false truncation
detection on sglang, vLLM, LM Studio, and other non-Ollama servers that
correctly report finish_reason.

When a GLM model on sglang/vLLM returned finish_reason='stop', the agent
mistakenly reclassified it as 'length' if the response didn't end with a
whitelisted punctuation character (ASCII or CJK). This particularly
affected Chinese-language responses and Markdown-formatted text.

Root cause: the is_local_endpoint() fallback assumed any local GLM
endpoint = Ollama. But many non-Ollama servers also run on localhost.

Fix: remove the is_local_endpoint() catch-all. Only detect Ollama via
its distinctive signatures (port 11434, 'ollama' in URL). All other
local servers are assumed to report finish_reason correctly.

This is the correct tradeoff because:
- False negatives (Ollama at custom port, heuristic not triggered) only
  mean the user sees a truncated response — same as having no heuristic
- False positives (non-Ollama server, heuristic wrongly triggered)
  inject spurious continuation messages into the conversation —
  strictly worse

Adds two tests:
- sglang GLM response is NOT reclassified as truncated
- Ollama GLM on port 11434 still triggers the heuristic as before

Co-authored-by: Hermes Agent
---
 run_agent.py                      | 20 ++++++--
 tests/run_agent/test_run_agent.py | 77 +++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index 8026b024e71..674758d3315 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1367,14 +1367,26 @@ class AIAgent:
         return False
 
     def _is_ollama_glm_backend(self) -> bool:
-        """Detect the narrow backend family affected by Ollama/GLM stop misreports."""
+        """Detect the narrow backend family affected by Ollama/GLM stop misreports.
+
+        Only returns True for backends that are known to be Ollama, which
+        can misreport truncated output as finish_reason='stop'. Other local
+        servers (sglang, vLLM, LM Studio, etc.) report finish_reason correctly
+        and must NOT be subjected to the stop->length heuristic.
+
+        Detection relies on explicit Ollama signatures:
+        - Port 11434 (Ollama default)
+        - "ollama" in the base URL (e.g. ollama.local, /ollama/ path)
+
+        The previous is_local_endpoint() fallback was too broad and caused
+        false truncation detection on non-Ollama local servers hosting GLM
+        models (sglang, vLLM, etc.).
+        """
         model_lower = (self.model or "").lower()
         provider_lower = (self.provider or "").lower()
         if "glm" not in model_lower and provider_lower != "zai":
             return False
-        if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
-            return True
-        return bool(self.base_url and is_local_endpoint(self.base_url))
+        return "ollama" in self._base_url_lower or ":11434" in self._base_url_lower
 
     def _should_treat_stop_as_truncated(
         self,
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 1fce5ba164b..ca6f9ca7c99 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -4526,6 +4526,83 @@ class TestRunConversation:
         assert result["api_calls"] == 2
         assert result["final_response"] == "Based on the search results, the best next"
 
+    def test_sglang_glm_stop_without_terminal_boundary_does_not_continue(self, agent):
+        """sglang/vLLM-hosted GLM models report finish_reason correctly.
+
+        The stop->length workaround must NOT apply to non-Ollama local
+        servers that expose OpenAI-compatible /v1 endpoints (sglang, vLLM,
+        LM Studio, etc.). A Chinese-text response ending without ASCII
+        punctuation should not be reclassified as truncated.
+        """
+        self._setup_agent(agent)
+        agent.base_url = "http://127.0.0.1:60000/v1"
+        agent._base_url_lower = agent.base_url.lower()
+        agent.model = "glm-5-fp8"
+
+        tool_turn = _mock_response(
+            content="",
+            finish_reason="tool_calls",
+            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
+        )
+        # Response ends with Chinese character (no ASCII punctuation) — NOT truncated
+        normal_stop = _mock_response(
+            content="根据搜索结果,建议修改配置",
+            finish_reason="stop",
+        )
+        agent.client.chat.completions.create.side_effect = [tool_turn, normal_stop]
+
+        with (
+            patch("run_agent.handle_function_call", return_value="search result"),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["api_calls"] == 2
+        assert result["final_response"] == "根据搜索结果,建议修改配置"
+
+    def test_ollama_glm_on_port_11434_still_triggers_heuristic(self, agent):
+        """Ollama on port 11434 should still trigger the stop->length heuristic."""
+        self._setup_agent(agent)
+        agent.base_url = "http://localhost:11434/v1"
+        agent._base_url_lower = agent.base_url.lower()
+        agent.model = "glm-5.1:cloud"
+
+        tool_turn = _mock_response(
+            content="",
+            finish_reason="tool_calls",
+            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
+        )
+        misreported_stop = _mock_response(
+            content="Based on the search results, the best next",
+            finish_reason="stop",
+        )
+        continued = _mock_response(
+            content=" step is to update the config.",
+            finish_reason="stop",
+        )
+        agent.client.chat.completions.create.side_effect = [
+            tool_turn,
+            misreported_stop,
+            continued,
+        ]
+
+        with (
+            patch("run_agent.handle_function_call", return_value="search result"),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["api_calls"] == 3
+        third_call_messages = agent.client.chat.completions.create.call_args_list[2].kwargs["messages"]
+        assert "truncated by the output length limit" in third_call_messages[-1]["content"]
+
+
     def test_length_thinking_exhausted_skips_continuation(self, agent):
         """When finish_reason='length' but content is only thinking, skip retries."""
         self._setup_agent(agent)