fix(agent): continue ollama glm truncation replies

This commit is contained in:
LeonSGP43 2026-04-16 12:31:24 +08:00 committed by Teknium
parent 1b61ec470b
commit 8011aa31ba
2 changed files with 172 additions and 0 deletions

View file

@ -2103,6 +2103,59 @@ class AIAgent:
content = re.sub(r'</?(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>\s*', '', content, flags=re.IGNORECASE)
return content
@staticmethod
def _has_natural_response_ending(content: str) -> bool:
"""Heuristic: does visible assistant text look intentionally finished?"""
if not content:
return False
stripped = content.rstrip()
if not stripped:
return False
if stripped.endswith("```"):
return True
return stripped[-1] in '.!?:)"\']}。!?:)】」』》'
def _is_ollama_glm_backend(self) -> bool:
"""Detect the narrow backend family affected by Ollama/GLM stop misreports."""
model_lower = (self.model or "").lower()
provider_lower = (self.provider or "").lower()
if "glm" not in model_lower and provider_lower != "zai":
return False
if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
return True
return bool(self.base_url and is_local_endpoint(self.base_url))
def _should_treat_stop_as_truncated(
self,
finish_reason: str,
assistant_message,
messages: Optional[list] = None,
) -> bool:
"""Detect conservative stop->length misreports for Ollama-hosted GLM models."""
if finish_reason != "stop" or self.api_mode != "chat_completions":
return False
if not self._is_ollama_glm_backend():
return False
if not any(
isinstance(msg, dict) and msg.get("role") == "tool"
for msg in (messages or [])
):
return False
if assistant_message is None or getattr(assistant_message, "tool_calls", None):
return False
content = getattr(assistant_message, "content", None)
if not isinstance(content, str):
return False
visible_text = self._strip_think_blocks(content).strip()
if not visible_text:
return False
if len(visible_text) < 20 or not re.search(r"\s", visible_text):
return False
return not self._has_natural_response_ending(visible_text)
def _looks_like_codex_intermediate_ack(
self,
user_message: str,
@ -9038,6 +9091,17 @@ class AIAgent:
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
else:
finish_reason = response.choices[0].finish_reason
assistant_message = response.choices[0].message
if self._should_treat_stop_as_truncated(
finish_reason,
assistant_message,
messages,
):
self._vprint(
f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated",
force=True,
)
finish_reason = "length"
if finish_reason == "length":
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)

View file

@ -2202,6 +2202,114 @@ class TestRunConversation:
assert second_call_messages[-1]["role"] == "user"
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
    """A mid-sentence ``stop`` from GLM on Ollama after a tool turn is continued.

    Scripted responses: a tool call, a ``stop`` reply ending without a
    terminal boundary, then the remainder.  The run must take three API
    calls, stitch both text parts together, and send the truncation
    notice as the last user message of the third request.
    """
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    scripted_responses = [
        # 1) the model asks for a tool
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
        ),
        # 2) reply cut off mid-sentence but reported as "stop"
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
        # 3) continuation that finishes the sentence
        _mock_response(
            content=" step is to update the config.",
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    expected_text = "Based on the search results, the best next step is to update the config."
    assert outcome["completed"] is True
    assert outcome["api_calls"] == 3
    assert outcome["final_response"] == expected_text

    recorded_calls = agent.client.chat.completions.create.call_args_list
    last_prompt = recorded_calls[2].kwargs["messages"][-1]
    assert last_prompt["role"] == "user"
    assert "truncated by the output length limit" in last_prompt["content"]
def test_ollama_glm_stop_with_terminal_boundary_does_not_continue(self, agent):
    """A GLM-on-Ollama ``stop`` reply that ends with a period is accepted as final.

    Scripted responses: a tool call followed by one complete sentence.
    The run must finish after exactly two API calls with that sentence
    as the final response.
    """
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    full_sentence = "Based on the search results, the best next step is to update the config."
    scripted_responses = [
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
        ),
        _mock_response(
            content=full_sentence,
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == full_sentence
def test_non_ollama_stop_without_terminal_boundary_does_not_continue(self, agent):
    """The stop->length reclassification must not fire for a non-Ollama, non-GLM backend.

    With an OpenAI base URL and a GPT model, a ``stop`` reply that ends
    mid-sentence after a tool turn is returned verbatim after two API
    calls.
    """
    self._setup_agent(agent)
    agent.base_url = "https://api.openai.com/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "gpt-4o-mini"

    unfinished_text = "Based on the search results, the best next"
    scripted_responses = [
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[_mock_tool_call(name="web_search", arguments="{}", call_id="c1")],
        ),
        _mock_response(
            content=unfinished_text,
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == unfinished_text
def test_length_thinking_exhausted_skips_continuation(self, agent):
"""When finish_reason='length' but content is only thinking, skip retries."""
self._setup_agent(agent)