fix: GLM reasoning-only and max-length handling (#3010)

- Add 'prompt exceeds max length' to context overflow detection for
  Z.AI/GLM 400 errors
- Extract inline reasoning blocks from assistant content as fallback
  when no structured reasoning fields are present
- Guard inline extraction so structured API reasoning takes priority
- Update test for reasoning-only response salvage behavior

Cherry-picked from PR #2993 by kshitijk4poor. Added priority guard
to fix test_structured_reasoning_takes_priority failure.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
This commit is contained in:
Teknium 2026-03-25 12:05:37 -07:00 committed by GitHub
parent 68ab37e891
commit 099dfca6db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 68 additions and 5 deletions

View file

@ -1326,6 +1326,24 @@ class AIAgent:
summary = detail.get('summary') or detail.get('content') or detail.get('text')
if summary and summary not in reasoning_parts:
reasoning_parts.append(summary)
# Some providers embed reasoning directly inside assistant content
# instead of returning structured reasoning fields. Only fall back
# to inline extraction when no structured reasoning was found.
content = getattr(assistant_message, "content", None)
if not reasoning_parts and isinstance(content, str) and content:
inline_patterns = (
r"<think>(.*?)</think>",
r"<thinking>(.*?)</thinking>",
r"<reasoning>(.*?)</reasoning>",
r"<REASONING_SCRATCHPAD>(.*?)</REASONING_SCRATCHPAD>",
)
for pattern in inline_patterns:
flags = re.DOTALL | re.IGNORECASE
for block in re.findall(pattern, content, flags=flags):
cleaned = block.strip()
if cleaned and cleaned not in reasoning_parts:
reasoning_parts.append(cleaned)
# Combine all reasoning parts
if reasoning_parts:
@ -6392,6 +6410,7 @@ class AIAgent:
'exceeds the limit', 'context window',
'request entity too large', # OpenRouter/Nous 413 safety net
'prompt is too long', # Anthropic: "prompt is too long: N tokens > M maximum"
'prompt exceeds max length', # Z.AI / GLM: generic 400 overflow wording
])
# Fallback heuristic: Anthropic sometimes returns a generic

View file

@ -267,6 +267,21 @@ class TestExtractReasoning:
result = agent._extract_reasoning(msg)
assert result == "same text"
@pytest.mark.parametrize(
    ("content", "expected"),
    [
        ("<think>thinking hard</think>", "thinking hard"),
        ("<thinking>step by step</thinking>", "step by step"),
        (
            "<REASONING_SCRATCHPAD>scratch analysis</REASONING_SCRATCHPAD>",
            "scratch analysis",
        ),
    ],
)
def test_inline_reasoning_blocks_fallback(self, agent, content, expected):
    """Reasoning embedded in inline tags is extracted when no structured fields exist."""
    message = _mock_assistant_msg(content=content)
    extracted = agent._extract_reasoning(message)
    assert extracted == expected
class TestCleanSessionContent:
def test_none_passthrough(self):
@ -1202,8 +1217,8 @@ class TestRunConversation:
assert result["completed"] is True
assert result["api_calls"] == 2
def test_empty_content_retry_and_fallback(self, agent):
"""Empty content (only think block) retries, then falls back to partial."""
def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
"""Reasoning-only payloads should recover the inline reasoning text."""
self._setup_agent(agent)
empty_resp = _mock_response(
content="<think>internal reasoning</think>",
@ -1221,9 +1236,8 @@ class TestRunConversation:
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("answer me")
# After 3 retries with no real content, should return partial
assert result["completed"] is False
assert result.get("partial") is True
assert result["completed"] is True
assert result["final_response"] == "internal reasoning"
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
self._setup_agent(agent)
@ -1296,6 +1310,36 @@ class TestRunConversation:
assert result["final_response"] == "All done"
assert result["completed"] is True
def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent):
    """A 400 worded 'Prompt exceeds max length' (GLM/Z.AI) must trigger context compression."""
    self._setup_agent(agent)

    # First call fails with the GLM-style overflow wording; second succeeds.
    overflow_error = Exception(
        "Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}"
    )
    overflow_error.status_code = 400
    recovered = _mock_response(content="Recovered after compression", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [overflow_error, recovered]

    history = [
        {"role": "user", "content": "previous question"},
        {"role": "assistant", "content": "previous answer"},
    ]

    with (
        patch.object(agent, "_compress_context") as compress_mock,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        compress_mock.return_value = (
            [{"role": "user", "content": "hello"}],
            "compressed system prompt",
        )
        result = agent.run_conversation("hello", conversation_history=history)

    compress_mock.assert_called_once()
    assert result["final_response"] == "Recovered after compression"
    assert result["completed"] is True
@pytest.mark.parametrize(
("first_content", "second_content", "expected_final"),
[