mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(agent): continue ollama glm truncation replies
This commit is contained in:
parent
1b61ec470b
commit
8011aa31ba
2 changed files with 172 additions and 0 deletions
64
run_agent.py
64
run_agent.py
|
|
@ -2103,6 +2103,59 @@ class AIAgent:
|
|||
content = re.sub(r'</?(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>\s*', '', content, flags=re.IGNORECASE)
|
||||
return content
|
||||
|
||||
@staticmethod
|
||||
def _has_natural_response_ending(content: str) -> bool:
|
||||
"""Heuristic: does visible assistant text look intentionally finished?"""
|
||||
if not content:
|
||||
return False
|
||||
stripped = content.rstrip()
|
||||
if not stripped:
|
||||
return False
|
||||
if stripped.endswith("```"):
|
||||
return True
|
||||
return stripped[-1] in '.!?:)"\']}。!?:)】」』》'
|
||||
|
||||
def _is_ollama_glm_backend(self) -> bool:
|
||||
"""Detect the narrow backend family affected by Ollama/GLM stop misreports."""
|
||||
model_lower = (self.model or "").lower()
|
||||
provider_lower = (self.provider or "").lower()
|
||||
if "glm" not in model_lower and provider_lower != "zai":
|
||||
return False
|
||||
if "ollama" in self._base_url_lower or ":11434" in self._base_url_lower:
|
||||
return True
|
||||
return bool(self.base_url and is_local_endpoint(self.base_url))
|
||||
|
||||
def _should_treat_stop_as_truncated(
|
||||
self,
|
||||
finish_reason: str,
|
||||
assistant_message,
|
||||
messages: Optional[list] = None,
|
||||
) -> bool:
|
||||
"""Detect conservative stop->length misreports for Ollama-hosted GLM models."""
|
||||
if finish_reason != "stop" or self.api_mode != "chat_completions":
|
||||
return False
|
||||
if not self._is_ollama_glm_backend():
|
||||
return False
|
||||
if not any(
|
||||
isinstance(msg, dict) and msg.get("role") == "tool"
|
||||
for msg in (messages or [])
|
||||
):
|
||||
return False
|
||||
if assistant_message is None or getattr(assistant_message, "tool_calls", None):
|
||||
return False
|
||||
|
||||
content = getattr(assistant_message, "content", None)
|
||||
if not isinstance(content, str):
|
||||
return False
|
||||
|
||||
visible_text = self._strip_think_blocks(content).strip()
|
||||
if not visible_text:
|
||||
return False
|
||||
if len(visible_text) < 20 or not re.search(r"\s", visible_text):
|
||||
return False
|
||||
|
||||
return not self._has_natural_response_ending(visible_text)
|
||||
|
||||
def _looks_like_codex_intermediate_ack(
|
||||
self,
|
||||
user_message: str,
|
||||
|
|
@ -9038,6 +9091,17 @@ class AIAgent:
|
|||
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
|
||||
else:
|
||||
finish_reason = response.choices[0].finish_reason
|
||||
assistant_message = response.choices[0].message
|
||||
if self._should_treat_stop_as_truncated(
|
||||
finish_reason,
|
||||
assistant_message,
|
||||
messages,
|
||||
):
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated",
|
||||
force=True,
|
||||
)
|
||||
finish_reason = "length"
|
||||
|
||||
if finish_reason == "length":
|
||||
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
|
||||
|
|
|
|||
|
|
@ -2202,6 +2202,114 @@ class TestRunConversation:
|
|||
assert second_call_messages[-1]["role"] == "user"
|
||||
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
|
||||
|
||||
def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
    """A GLM reply on Ollama that stops mid-sentence after a tool call is continued."""
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    # Turn 1: tool call.  Turn 2: "stop" without terminal punctuation.
    # Turn 3: the continuation that completes the sentence.
    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    scripted_responses = [
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[search_call],
        ),
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
        _mock_response(
            content=" step is to update the config.",
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    expected_reply = "Based on the search results, the best next step is to update the config."
    assert outcome["completed"] is True
    assert outcome["api_calls"] == 3
    assert outcome["final_response"] == expected_reply

    # The third request must end with the user-role continuation prompt.
    recorded_calls = agent.client.chat.completions.create.call_args_list
    final_prompt = recorded_calls[2].kwargs["messages"][-1]
    assert final_prompt["role"] == "user"
    assert "truncated by the output length limit" in final_prompt["content"]
|
||||
|
||||
def test_ollama_glm_stop_with_terminal_boundary_does_not_continue(self, agent):
    """A GLM reply on Ollama that ends with a full stop is accepted as final."""
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    # Turn 1: tool call.  Turn 2: a complete sentence reported as "stop".
    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    scripted_responses = [
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[search_call],
        ),
        _mock_response(
            content="Based on the search results, the best next step is to update the config.",
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    expected_reply = "Based on the search results, the best next step is to update the config."
    assert outcome["completed"] is True
    # Exactly two requests: no continuation was asked for.
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == expected_reply
|
||||
|
||||
def test_non_ollama_stop_without_terminal_boundary_does_not_continue(self, agent):
    """Outside Ollama/GLM, an unpunctuated "stop" reply is returned unchanged."""
    self._setup_agent(agent)
    agent.base_url = "https://api.openai.com/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "gpt-4o-mini"

    # Turn 1: tool call.  Turn 2: a "stop" reply that ends mid-sentence.
    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    scripted_responses = [
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[search_call],
        ),
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_responses

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    # Exactly two requests: the workaround did not trigger.
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == "Based on the search results, the best next"
|
||||
|
||||
def test_length_thinking_exhausted_skips_continuation(self, agent):
|
||||
"""When finish_reason='length' but content is only thinking, skip retries."""
|
||||
self._setup_agent(agent)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue