mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: GLM reasoning-only and max-length handling (#3010)
- Add 'prompt exceeds max length' to context overflow detection for Z.AI/GLM 400 errors
- Extract inline reasoning blocks from assistant content as fallback when no structured reasoning fields are present
- Guard inline extraction so structured API reasoning takes priority
- Update test for reasoning-only response salvage behavior

Cherry-picked from PR #2993 by kshitijk4poor. Added priority guard to fix test_structured_reasoning_takes_priority failure.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
This commit is contained in:
parent
68ab37e891
commit
099dfca6db
2 changed files with 68 additions and 5 deletions
19
run_agent.py
19
run_agent.py
|
|
@@ -1326,6 +1326,24 @@ class AIAgent:
|
|||
summary = detail.get('summary') or detail.get('content') or detail.get('text')
|
||||
if summary and summary not in reasoning_parts:
|
||||
reasoning_parts.append(summary)
|
||||
|
||||
# Some providers embed reasoning directly inside assistant content
|
||||
# instead of returning structured reasoning fields. Only fall back
|
||||
# to inline extraction when no structured reasoning was found.
|
||||
content = getattr(assistant_message, "content", None)
|
||||
if not reasoning_parts and isinstance(content, str) and content:
|
||||
inline_patterns = (
|
||||
r"<think>(.*?)</think>",
|
||||
r"<thinking>(.*?)</thinking>",
|
||||
r"<reasoning>(.*?)</reasoning>",
|
||||
r"<REASONING_SCRATCHPAD>(.*?)</REASONING_SCRATCHPAD>",
|
||||
)
|
||||
for pattern in inline_patterns:
|
||||
flags = re.DOTALL | re.IGNORECASE
|
||||
for block in re.findall(pattern, content, flags=flags):
|
||||
cleaned = block.strip()
|
||||
if cleaned and cleaned not in reasoning_parts:
|
||||
reasoning_parts.append(cleaned)
|
||||
|
||||
# Combine all reasoning parts
|
||||
if reasoning_parts:
|
||||
|
|
@@ -6392,6 +6410,7 @@ class AIAgent:
|
|||
'exceeds the limit', 'context window',
|
||||
'request entity too large', # OpenRouter/Nous 413 safety net
|
||||
'prompt is too long', # Anthropic: "prompt is too long: N tokens > M maximum"
|
||||
'prompt exceeds max length', # Z.AI / GLM: generic 400 overflow wording
|
||||
])
|
||||
|
||||
# Fallback heuristic: Anthropic sometimes returns a generic
|
||||
|
|
|
|||
|
|
@@ -267,6 +267,21 @@ class TestExtractReasoning:
|
|||
result = agent._extract_reasoning(msg)
|
||||
assert result == "same text"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("content", "expected"),
|
||||
[
|
||||
("<think>thinking hard</think>", "thinking hard"),
|
||||
("<thinking>step by step</thinking>", "step by step"),
|
||||
(
|
||||
"<REASONING_SCRATCHPAD>scratch analysis</REASONING_SCRATCHPAD>",
|
||||
"scratch analysis",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_inline_reasoning_blocks_fallback(self, agent, content, expected):
|
||||
msg = _mock_assistant_msg(content=content)
|
||||
assert agent._extract_reasoning(msg) == expected
|
||||
|
||||
|
||||
class TestCleanSessionContent:
|
||||
def test_none_passthrough(self):
|
||||
|
|
@@ -1202,8 +1217,8 @@ class TestRunConversation:
|
|||
assert result["completed"] is True
|
||||
assert result["api_calls"] == 2
|
||||
|
||||
def test_empty_content_retry_and_fallback(self, agent):
|
||||
"""Empty content (only think block) retries, then falls back to partial."""
|
||||
def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
|
||||
"""Reasoning-only payloads should recover the inline reasoning text."""
|
||||
self._setup_agent(agent)
|
||||
empty_resp = _mock_response(
|
||||
content="<think>internal reasoning</think>",
|
||||
|
|
@@ -1221,9 +1236,8 @@ class TestRunConversation:
|
|||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
result = agent.run_conversation("answer me")
|
||||
# After 3 retries with no real content, should return partial
|
||||
assert result["completed"] is False
|
||||
assert result.get("partial") is True
|
||||
assert result["completed"] is True
|
||||
assert result["final_response"] == "internal reasoning"
|
||||
|
||||
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
|
||||
self._setup_agent(agent)
|
||||
|
|
@@ -1296,6 +1310,36 @@ class TestRunConversation:
|
|||
assert result["final_response"] == "All done"
|
||||
assert result["completed"] is True
|
||||
|
||||
def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent):
|
||||
"""GLM/Z.AI uses 'Prompt exceeds max length' for context overflow."""
|
||||
self._setup_agent(agent)
|
||||
err_400 = Exception(
|
||||
"Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}"
|
||||
)
|
||||
err_400.status_code = 400
|
||||
ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
|
||||
agent.client.chat.completions.create.side_effect = [err_400, ok_resp]
|
||||
prefill = [
|
||||
{"role": "user", "content": "previous question"},
|
||||
{"role": "assistant", "content": "previous answer"},
|
||||
]
|
||||
|
||||
with (
|
||||
patch.object(agent, "_compress_context") as mock_compress,
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
mock_compress.return_value = (
|
||||
[{"role": "user", "content": "hello"}],
|
||||
"compressed system prompt",
|
||||
)
|
||||
result = agent.run_conversation("hello", conversation_history=prefill)
|
||||
|
||||
mock_compress.assert_called_once()
|
||||
assert result["final_response"] == "Recovered after compression"
|
||||
assert result["completed"] is True
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("first_content", "second_content", "expected_final"),
|
||||
[
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue