fix: GLM reasoning-only and max-length handling (#3010)

- Add 'prompt exceeds max length' to context overflow detection for
  Z.AI/GLM 400 errors
- Extract inline reasoning blocks from assistant content as fallback
  when no structured reasoning fields are present
- Guard inline extraction so structured API reasoning takes priority
- Update test for reasoning-only response salvage behavior

Cherry-picked from PR #2993 by kshitijk4poor. Added priority guard
to fix test_structured_reasoning_takes_priority failure.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
This commit is contained in:
Teknium 2026-03-25 12:05:37 -07:00 committed by GitHub
parent 68ab37e891
commit 099dfca6db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 68 additions and 5 deletions

View file

@ -1326,6 +1326,24 @@ class AIAgent:
summary = detail.get('summary') or detail.get('content') or detail.get('text')
if summary and summary not in reasoning_parts:
reasoning_parts.append(summary)
# Some providers embed reasoning directly inside assistant content
# instead of returning structured reasoning fields. Only fall back
# to inline extraction when no structured reasoning was found.
content = getattr(assistant_message, "content", None)
if not reasoning_parts and isinstance(content, str) and content:
inline_patterns = (
r"<think>(.*?)</think>",
r"<thinking>(.*?)</thinking>",
r"<reasoning>(.*?)</reasoning>",
r"<REASONING_SCRATCHPAD>(.*?)</REASONING_SCRATCHPAD>",
)
for pattern in inline_patterns:
flags = re.DOTALL | re.IGNORECASE
for block in re.findall(pattern, content, flags=flags):
cleaned = block.strip()
if cleaned and cleaned not in reasoning_parts:
reasoning_parts.append(cleaned)
# Combine all reasoning parts
if reasoning_parts:
@ -6392,6 +6410,7 @@ class AIAgent:
'exceeds the limit', 'context window',
'request entity too large', # OpenRouter/Nous 413 safety net
'prompt is too long', # Anthropic: "prompt is too long: N tokens > M maximum"
'prompt exceeds max length', # Z.AI / GLM: generic 400 overflow wording
])
# Fallback heuristic: Anthropic sometimes returns a generic

View file

@ -267,6 +267,21 @@ class TestExtractReasoning:
result = agent._extract_reasoning(msg)
assert result == "same text"
@pytest.mark.parametrize(
    ("content", "expected"),
    [
        ("<think>thinking hard</think>", "thinking hard"),
        ("<thinking>step by step</thinking>", "step by step"),
        (
            "<REASONING_SCRATCHPAD>scratch analysis</REASONING_SCRATCHPAD>",
            "scratch analysis",
        ),
    ],
)
def test_inline_reasoning_blocks_fallback(self, agent, content, expected):
    """Reasoning embedded in inline tags is extracted when no structured fields exist."""
    message = _mock_assistant_msg(content=content)
    extracted = agent._extract_reasoning(message)
    assert extracted == expected
class TestCleanSessionContent:
def test_none_passthrough(self):
@ -1202,8 +1217,8 @@ class TestRunConversation:
assert result["completed"] is True
assert result["api_calls"] == 2
def test_empty_content_retry_and_fallback(self, agent):
"""Empty content (only think block) retries, then falls back to partial."""
def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
"""Reasoning-only payloads should recover the inline reasoning text."""
self._setup_agent(agent)
empty_resp = _mock_response(
content="<think>internal reasoning</think>",
@ -1221,9 +1236,8 @@ class TestRunConversation:
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("answer me")
# After 3 retries with no real content, should return partial
assert result["completed"] is False
assert result.get("partial") is True
assert result["completed"] is True
assert result["final_response"] == "internal reasoning"
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
self._setup_agent(agent)
@ -1296,6 +1310,36 @@ class TestRunConversation:
assert result["final_response"] == "All done"
assert result["completed"] is True
def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent):
    """A 400 worded 'Prompt exceeds max length' (GLM/Z.AI) must trigger context compression."""
    self._setup_agent(agent)

    # First call fails with the GLM-style overflow wording; second succeeds.
    overflow_error = Exception(
        "Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}"
    )
    overflow_error.status_code = 400
    recovered = _mock_response(content="Recovered after compression", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [overflow_error, recovered]

    history = [
        {"role": "user", "content": "previous question"},
        {"role": "assistant", "content": "previous answer"},
    ]

    with (
        patch.object(agent, "_compress_context") as compress_mock,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        compress_mock.return_value = (
            [{"role": "user", "content": "hello"}],
            "compressed system prompt",
        )
        result = agent.run_conversation("hello", conversation_history=history)

    compress_mock.assert_called_once()
    assert result["final_response"] == "Recovered after compression"
    assert result["completed"] is True
@pytest.mark.parametrize(
("first_content", "second_content", "expected_final"),
[