fix(agent): continue ollama glm truncation replies

2026-07-03 12:23:08 +00:00 · 2026-04-16 12:31:24 +08:00 · 2026-04-16 12:31:24 +08:00 · 8011aa31ba
commit 8011aa31ba
parent 1b61ec470b
2 changed files with 172 additions and 0 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -2103,6 +2103,59 @@ class AIAgent:
        content = re.sub(r'</?(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>\s*', '', content, flags=re.IGNORECASE)
        return content
    @staticmethod
    def _has_natural_response_ending(content: str) -> bool:
        """Heuristic: does visible assistant text look intentionally finished?"""
        if not content:
            return False
        stripped = content.rstrip()
        if not stripped:
            return False
        if stripped.endswith("```"):
            return True
        return stripped[-1] in '.!?:)"\']}。！？：）】」』》'
    def _is_ollama_glm_backend(self) -> bool:
        """Detect the narrow backend family affected by Ollama/GLM stop misreports."""
        model_lower = (self.model or "").lower()
        provider_lower = (self.provider or "").lower()
        if "glm" not in model_lower and provider_lower != "zai":
            return False
        if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
            return True
        return bool(self.base_url and is_local_endpoint(self.base_url))
    def _should_treat_stop_as_truncated(
        self,
        finish_reason: str,
        assistant_message,
        messages: Optional[list] = None,
    ) -> bool:
        """Detect conservative stop->length misreports for Ollama-hosted GLM models."""
        if finish_reason != "stop" or self.api_mode != "chat_completions":
            return False
        if not self._is_ollama_glm_backend():
            return False
        if not any(
            isinstance(msg, dict) and msg.get("role") == "tool"
            for msg in (messages or [])
        ):
            return False
        if assistant_message is None or getattr(assistant_message, "tool_calls", None):
            return False
        content = getattr(assistant_message, "content", None)
        if not isinstance(content, str):
            return False
        visible_text = self._strip_think_blocks(content).strip()
        if not visible_text:
            return False
        if len(visible_text) < 20 or not re.search(r"\s", visible_text):
            return False
        return not self._has_natural_response_ending(visible_text)
    def _looks_like_codex_intermediate_ack(
        self,
        user_message: str,
@ -9038,6 +9091,17 @@ class AIAgent:
                        finish_reason = stop_reason_map.get(response.stop_reason, "stop")
                    else:
                        finish_reason = response.choices[0].finish_reason
                        assistant_message = response.choices[0].message
                        if self._should_treat_stop_as_truncated(
                            finish_reason,
                            assistant_message,
                            messages,
                        ):
                            self._vprint(
                                f"{self.log_prefix}⚠️  Treating suspicious Ollama/GLM stop response as truncated",
                                force=True,
                            )
                            finish_reason = "length"
                    if finish_reason == "length":
                        self._vprint(f"{self.log_prefix}⚠️  Response truncated (finish_reason='length') - model hit max output tokens", force=True)
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -2202,6 +2202,114 @@ class TestRunConversation:
        assert second_call_messages[-1]["role"] == "user"
        assert "truncated by the output length limit" in second_call_messages[-1]["content"]
    def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
        """A mid-sentence GLM ``stop`` after a tool turn triggers a continuation.

        Scripted turns: a tool call, a ``stop`` reply that ends mid-sentence,
        then the remainder of the sentence.  The agent is expected to make a
        third API call and join both fragments into the final response.
        """
        self._setup_agent(agent)
        agent.base_url = "http://localhost:11434/v1"
        agent._base_url_lower = agent.base_url.lower()
        agent.model = "glm-5.1:cloud"

        agent.client.chat.completions.create.side_effect = [
            # Turn 1: the model requests a tool.
            _mock_response(
                content="",
                finish_reason="tool_calls",
                tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
            ),
            # Turn 2: reported as "stop" although the sentence is cut off.
            _mock_response(
                content="Based on the search results, the best next",
                finish_reason="stop",
            ),
            # Turn 3: the continuation that completes the sentence.
            _mock_response(
                content=" step is to update the config.",
                finish_reason="stop",
            ),
        ]

        with (
            patch("run_agent.handle_function_call", return_value="search result"),
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            outcome = agent.run_conversation("hello")

        expected_text = "Based on the search results, the best next step is to update the config."
        assert outcome["completed"] is True
        assert outcome["api_calls"] == 3
        assert outcome["final_response"] == expected_text

        # The third request must end with the user-role continuation prompt.
        third_call = agent.client.chat.completions.create.call_args_list[2]
        last_message = third_call.kwargs["messages"][-1]
        assert last_message["role"] == "user"
        assert "truncated by the output length limit" in last_message["content"]
    def test_ollama_glm_stop_with_terminal_boundary_does_not_continue(self, agent):
        """A GLM ``stop`` reply that ends with a full sentence is accepted as final.

        Scripted turns: a tool call followed by a complete sentence.  Only two
        API calls are expected and the reply is returned unchanged.
        """
        self._setup_agent(agent)
        agent.base_url = "http://localhost:11434/v1"
        agent._base_url_lower = agent.base_url.lower()
        agent.model = "glm-5.1:cloud"

        full_sentence = "Based on the search results, the best next step is to update the config."
        scripted_turns = [
            _mock_response(
                content="",
                finish_reason="tool_calls",
                tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
            ),
            _mock_response(content=full_sentence, finish_reason="stop"),
        ]
        agent.client.chat.completions.create.side_effect = scripted_turns

        with (
            patch("run_agent.handle_function_call", return_value="search result"),
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            outcome = agent.run_conversation("hello")

        assert outcome["completed"] is True
        # No third call: the reply was not reclassified as truncated.
        assert outcome["api_calls"] == 2
        assert outcome["final_response"] == full_sentence
    def test_non_ollama_stop_without_terminal_boundary_does_not_continue(self, agent):
        """A mid-sentence ``stop`` from a non-Ollama, non-GLM backend is left alone.

        Uses the same cut-off reply as the Ollama/GLM continuation test, but
        with an OpenAI base URL and model.  Only two API calls are expected
        and the cut-off text is returned as the final response.
        """
        self._setup_agent(agent)
        agent.base_url = "https://api.openai.com/v1"
        agent._base_url_lower = agent.base_url.lower()
        agent.model = "gpt-4o-mini"

        cut_off_text = "Based on the search results, the best next"
        scripted_turns = [
            _mock_response(
                content="",
                finish_reason="tool_calls",
                tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
            ),
            _mock_response(content=cut_off_text, finish_reason="stop"),
        ]
        agent.client.chat.completions.create.side_effect = scripted_turns

        with (
            patch("run_agent.handle_function_call", return_value="search result"),
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            outcome = agent.run_conversation("hello")

        assert outcome["completed"] is True
        assert outcome["api_calls"] == 2
        assert outcome["final_response"] == cut_off_text
    def test_length_thinking_exhausted_skips_continuation(self, agent):
        """When finish_reason='length' but content is only thinking, skip retries."""
        self._setup_agent(agent)