mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-03 12:23:08 +00:00
fix(agent): continue ollama glm truncation replies
This commit is contained in:
parent
1b61ec470b
commit
8011aa31ba
2 changed files with 172 additions and 0 deletions
64
run_agent.py
64
run_agent.py
|
|
@ -2103,6 +2103,59 @@ class AIAgent:
|
||||||
content = re.sub(r'</?(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>\s*', '', content, flags=re.IGNORECASE)
|
content = re.sub(r'</?(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>\s*', '', content, flags=re.IGNORECASE)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _has_natural_response_ending(content: str) -> bool:
|
||||||
|
"""Heuristic: does visible assistant text look intentionally finished?"""
|
||||||
|
if not content:
|
||||||
|
return False
|
||||||
|
stripped = content.rstrip()
|
||||||
|
if not stripped:
|
||||||
|
return False
|
||||||
|
if stripped.endswith("```"):
|
||||||
|
return True
|
||||||
|
return stripped[-1] in '.!?:)"\']}。!?:)】」』》'
|
||||||
|
|
||||||
|
def _is_ollama_glm_backend(self) -> bool:
    """Report whether this agent targets the Ollama/GLM backend family.

    Returns:
        ``False`` unless the model name contains ``glm`` or the provider is
        ``zai`` (both compared case-insensitively). Otherwise ``True`` when
        the lowercased base URL mentions ``ollama`` or port ``:11434``, or
        when ``is_local_endpoint`` accepts the configured base URL.
    """
    model_name = (self.model or "").lower()
    provider_name = (self.provider or "").lower()

    glm_family = "glm" in model_name or provider_name == "zai"
    if not glm_family:
        return False

    url_lower = self._base_url_lower
    if any(marker in url_lower for marker in ("ollama", ":11434")):
        return True

    # No URL hint: fall back to asking whether the endpoint is local.
    if not self.base_url:
        return False
    return bool(is_local_endpoint(self.base_url))
|
||||||
|
|
||||||
|
def _should_treat_stop_as_truncated(
    self,
    finish_reason: str,
    assistant_message,
    messages: Optional[list] = None,
) -> bool:
    """Decide whether a ``stop`` reply should be handled as a truncated one.

    Args:
        finish_reason: Finish reason reported for the response.
        assistant_message: Response message; its ``tool_calls`` and
            ``content`` attributes are read via ``getattr``.
        messages: Conversation history; only dict entries are inspected.

    Returns:
        ``True`` only when every condition holds: the finish reason is
        ``"stop"`` in ``chat_completions`` mode, the backend passes
        ``_is_ollama_glm_backend``, the history holds at least one
        ``role == "tool"`` message, the reply carries no tool calls, and its
        visible text (think blocks stripped) is at least 20 characters,
        contains whitespace, and lacks a natural ending.
    """
    is_chat_stop = finish_reason == "stop" and self.api_mode == "chat_completions"
    if not (is_chat_stop and self._is_ollama_glm_backend()):
        return False

    saw_tool_result = False
    for entry in messages or []:
        if isinstance(entry, dict) and entry.get("role") == "tool":
            saw_tool_result = True
            break
    if not saw_tool_result:
        return False

    if assistant_message is None:
        return False
    if getattr(assistant_message, "tool_calls", None):
        return False

    raw_text = getattr(assistant_message, "content", None)
    if not isinstance(raw_text, str):
        return False

    shown = self._strip_think_blocks(raw_text).strip()
    # Empty, very short, or single-token replies are never reclassified.
    if len(shown) < 20 or re.search(r"\s", shown) is None:
        return False

    return not self._has_natural_response_ending(shown)
|
||||||
|
|
||||||
def _looks_like_codex_intermediate_ack(
|
def _looks_like_codex_intermediate_ack(
|
||||||
self,
|
self,
|
||||||
user_message: str,
|
user_message: str,
|
||||||
|
|
@ -9038,6 +9091,17 @@ class AIAgent:
|
||||||
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
|
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
|
||||||
else:
|
else:
|
||||||
finish_reason = response.choices[0].finish_reason
|
finish_reason = response.choices[0].finish_reason
|
||||||
|
assistant_message = response.choices[0].message
|
||||||
|
if self._should_treat_stop_as_truncated(
|
||||||
|
finish_reason,
|
||||||
|
assistant_message,
|
||||||
|
messages,
|
||||||
|
):
|
||||||
|
self._vprint(
|
||||||
|
f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated",
|
||||||
|
force=True,
|
||||||
|
)
|
||||||
|
finish_reason = "length"
|
||||||
|
|
||||||
if finish_reason == "length":
|
if finish_reason == "length":
|
||||||
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
|
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
|
||||||
|
|
|
||||||
|
|
@ -2202,6 +2202,114 @@ class TestRunConversation:
|
||||||
assert second_call_messages[-1]["role"] == "user"
|
assert second_call_messages[-1]["role"] == "user"
|
||||||
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
|
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
|
||||||
|
|
||||||
|
def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
    """A GLM-on-Ollama ``stop`` reply that ends mid-sentence triggers a continuation call."""
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    create_mock = agent.client.chat.completions.create
    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    # Sequence: tool call, reply cut off mid-sentence, then the remainder.
    create_mock.side_effect = [
        _mock_response(content="", finish_reason="tool_calls", tool_calls=[search_call]),
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
        _mock_response(content=" step is to update the config.", finish_reason="stop"),
    ]
    expected_reply = "Based on the search results, the best next step is to update the config."

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    assert outcome["api_calls"] == 3
    assert outcome["final_response"] == expected_reply

    # The third request must end with the continuation prompt.
    continuation_prompt = create_mock.call_args_list[2].kwargs["messages"][-1]
    assert continuation_prompt["role"] == "user"
    assert "truncated by the output length limit" in continuation_prompt["content"]
|
||||||
|
|
||||||
|
def test_ollama_glm_stop_with_terminal_boundary_does_not_continue(self, agent):
    """A GLM-on-Ollama ``stop`` reply ending in a full sentence is accepted as-is."""
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    # Sequence: tool call, then one complete reply.
    agent.client.chat.completions.create.side_effect = [
        _mock_response(content="", finish_reason="tool_calls", tool_calls=[search_call]),
        _mock_response(
            content="Based on the search results, the best next step is to update the config.",
            finish_reason="stop",
        ),
    ]
    expected_reply = "Based on the search results, the best next step is to update the config."

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    # Two calls only: no continuation request was issued.
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == expected_reply
|
||||||
|
|
||||||
|
def test_non_ollama_stop_without_terminal_boundary_does_not_continue(self, agent):
    """An unterminated ``stop`` reply from a non-Ollama, non-GLM backend is left alone."""
    self._setup_agent(agent)
    agent.base_url = "https://api.openai.com/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "gpt-4o-mini"

    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    # Sequence: tool call, then a reply with no terminal punctuation.
    agent.client.chat.completions.create.side_effect = [
        _mock_response(content="", finish_reason="tool_calls", tool_calls=[search_call]),
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
    ]

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    # Two calls only: the unterminated text is returned unchanged.
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == "Based on the search results, the best next"
|
||||||
|
|
||||||
def test_length_thinking_exhausted_skips_continuation(self, agent):
|
def test_length_thinking_exhausted_skips_continuation(self, agent):
|
||||||
"""When finish_reason='length' but content is only thinking, skip retries."""
|
"""When finish_reason='length' but content is only thinking, skip retries."""
|
||||||
self._setup_agent(agent)
|
self._setup_agent(agent)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue