mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-02 12:13:05 +00:00
fix(agent): scope Ollama/GLM stop-to-length heuristic to Ollama only
The _is_ollama_glm_backend() function was too broad: any local endpoint
running a GLM model was treated as Ollama, triggering the stop->length
misreport heuristic introduced in 8011aa3. This caused false truncation
detection on sglang, vLLM, LM Studio, and other non-Ollama servers that
correctly report finish_reason.
When a GLM model on sglang/vLLM returned finish_reason='stop', the agent
mistakenly reclassified it as 'length' if the response didn't end with
a whitelisted punctuation character (ASCII or CJK). This particularly
affected Chinese-language responses and Markdown-formatted text.
Root cause: the is_local_endpoint() fallback assumed that any local GLM
endpoint was Ollama, but many non-Ollama servers also run on localhost.
Fix: remove the is_local_endpoint() catch-all. Only detect Ollama via
its distinctive signatures (port 11434, 'ollama' in URL). All other
local servers are assumed to report finish_reason correctly.
This is the correct tradeoff because:
- False negatives (Ollama at custom port, heuristic not triggered) only
mean the user sees a truncated response — same as having no heuristic
- False positives (non-Ollama server, heuristic wrongly triggered) inject
spurious continuation messages into the conversation — strictly worse
Adds two tests:
- sglang GLM response is NOT reclassified as truncated
- Ollama GLM on port 11434 still triggers the heuristic as before
Co-authored-by: Hermes Agent <hermes@nousresearch.com>
This commit is contained in:
parent
ab1f9b94c5
commit
00a8252b7d
2 changed files with 93 additions and 4 deletions
|
|
@ -4526,6 +4526,83 @@ class TestRunConversation:
|
|||
assert result["api_calls"] == 2
|
||||
assert result["final_response"] == "Based on the search results, the best next"
|
||||
|
||||
def test_sglang_glm_stop_without_terminal_boundary_does_not_continue(self, agent):
    """A 'stop' from a non-Ollama local GLM server is accepted as-is.

    Servers such as sglang, vLLM and LM Studio expose OpenAI-compatible
    /v1 endpoints and report finish_reason correctly, so the
    stop->length workaround must stay out of their way.  Here the agent
    points at a local endpoint on port 60000 (not Ollama's 11434, and
    no 'ollama' in the URL) and the model answers in Chinese without
    any closing punctuation; the answer must be returned unchanged
    after exactly two API calls.
    """
    self._setup_agent(agent)
    agent.base_url = "http://127.0.0.1:60000/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5-fp8"

    # Complete answer that ends on a Chinese character rather than on
    # punctuation -- it must NOT be treated as truncated.
    reply_text = "根据搜索结果,建议修改配置"
    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    scripted_turns = [
        # Turn 1: the model asks for a tool call.
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[search_call],
        ),
        # Turn 2: the model finishes normally.
        _mock_response(content=reply_text, finish_reason="stop"),
    ]
    agent.client.chat.completions.create.side_effect = scripted_turns

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    # Two calls only: no continuation request was issued.
    assert outcome["api_calls"] == 2
    assert outcome["final_response"] == reply_text
|
||||
|
||||
def test_ollama_glm_on_port_11434_still_triggers_heuristic(self, agent):
    """A GLM model behind Ollama's port 11434 keeps the stop->length heuristic.

    The second scripted response stops mid-sentence with
    finish_reason='stop'.  The agent is expected to make a third API
    call whose last message carries the truncation notice.
    """
    self._setup_agent(agent)
    agent.base_url = "http://localhost:11434/v1"
    agent._base_url_lower = agent.base_url.lower()
    agent.model = "glm-5.1:cloud"

    search_call = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
    scripted_turns = [
        # Turn 1: the model asks for a tool call.
        _mock_response(
            content="",
            finish_reason="tool_calls",
            tool_calls=[search_call],
        ),
        # Turn 2: cut off mid-sentence but reported as a normal stop.
        _mock_response(
            content="Based on the search results, the best next",
            finish_reason="stop",
        ),
        # Turn 3: the continuation that completes the sentence.
        _mock_response(
            content=" step is to update the config.",
            finish_reason="stop",
        ),
    ]
    agent.client.chat.completions.create.side_effect = scripted_turns

    with (
        patch("run_agent.handle_function_call", return_value="search result"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is True
    assert outcome["api_calls"] == 3

    # The third request must end with the continuation prompt.
    recorded_calls = agent.client.chat.completions.create.call_args_list
    continuation_prompt = recorded_calls[2].kwargs["messages"][-1]["content"]
    assert "truncated by the output length limit" in continuation_prompt
|
||||
|
||||
|
||||
def test_length_thinking_exhausted_skips_continuation(self, agent):
|
||||
"""When finish_reason='length' but content is only thinking, skip retries."""
|
||||
self._setup_agent(agent)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue