diff --git a/gateway/run.py b/gateway/run.py index 77ab89acaa..04061ba42c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -5584,15 +5584,43 @@ class GatewayRunner: # intermediate reasoning) so sessions can be resumed with full context # and transcripts are useful for debugging and training data. # - # IMPORTANT: When the agent failed (e.g. context-overflow 400, - # compression exhausted), do NOT persist the user's message. - # Persisting it would make the session even larger, causing the - # same failure on the next attempt — an infinite loop. (#1630, #9893) + # IMPORTANT: For context-overflow failures (compression exhausted, + # generic 400 on large sessions) we must NOT persist the user's + # message — doing so would grow the session further and cause the + # same failure on the next attempt, an infinite loop. (#1630, #9893) + # + # Transient failures (429, timeout, connection error, provider 5xx) + # are different: the session is not oversized, and silently dropping + # the user message causes severe context loss on retry — the agent + # forgets what was just asked. Persist the user turn so the + # conversation is preserved. (#7100) agent_failed_early = bool(agent_result.get("failed")) - if agent_failed_early: + _err_str_for_classify = str(agent_result.get("error", "")).lower() + # Use specific multi-word phrases (not bare "exceed" or "token") + # to avoid false positives on transient errors like "rate limit + # exceeded" or "invalid auth token". Matches run_agent.py's + # own context-length classifier. + is_context_overflow_failure = agent_failed_early and ( + bool(agent_result.get("compression_exhausted")) + or any(p in _err_str_for_classify for p in ( + "context length", "context size", "context window", + "maximum context", "token limit", "too many tokens", + "reduce the length", "exceeds the limit", + "request entity too large", "prompt is too long", + "payload too large", "input is too long", + )) + or ("400" in _err_str_for_classify and len(history) > 50) + ) + if is_context_overflow_failure: logger.info( - "Skipping transcript persistence for failed request in " - "session %s to prevent session growth loop.", + "Skipping transcript persistence for context-overflow " + "failure in session %s to prevent session growth loop.", + session_entry.session_id, + ) + elif agent_failed_early: + logger.info( + "Transient agent failure in session %s — persisting user " + "message so conversation context is preserved on retry.", session_entry.session_id, ) @@ -5622,7 +5650,7 @@ class GatewayRunner: # If this is a fresh session (no history), write the full tool # definitions as the first entry so the transcript is self-describing # -- the same list of dicts sent as tools=[...] in the API request. - if agent_failed_early: + if is_context_overflow_failure: pass # Skip all transcript writes — don't grow a broken session elif not history: tool_defs = agent_result.get("tools", []) @@ -5641,10 +5669,21 @@ class GatewayRunner: # Use the filtered history length (history_offset) that was actually # passed to the agent, not len(history) which includes session_meta # entries that were stripped before the agent saw them. - if not agent_failed_early: + if is_context_overflow_failure: + pass # handled above — skip all transcript writes + elif agent_failed_early: + # Transient failure (429/timeout/5xx): persist only the user + # message so the next message can load a transcript that + # reflects what was said. Skip the assistant error text since + # it's a gateway-generated hint, not model output. (#7100) + self.session_store.append_to_transcript( + session_entry.session_id, + {"role": "user", "content": message_text, "timestamp": ts}, + ) + else: history_len = agent_result.get("history_offset", len(history)) new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else [] - + # If no new messages found (edge case), fall back to simple user/assistant if not new_messages: self.session_store.append_to_transcript( diff --git a/tests/gateway/test_7100_transient_failure_transcript.py b/tests/gateway/test_7100_transient_failure_transcript.py new file mode 100644 index 0000000000..3340dc28d5 --- /dev/null +++ b/tests/gateway/test_7100_transient_failure_transcript.py @@ -0,0 +1,137 @@ +"""Tests for #7100 — transient failures (429/timeout) must not drop the +user message from the transcript. + +The #1630 fix introduced a blanket skip of transcript writes on any +``failed`` agent result. That was correct for context-overflow failures +(which would otherwise cause a session-growth loop), but it also caused +transient provider failures (rate limits, read timeouts, connection +resets) to silently drop the user's message — so the agent had no memory +of the last turn on the next attempt. + +The gateway classifier must distinguish: + +* ``compression_exhausted=True`` OR context-keyword errors OR a generic + ``400`` on a long history → context-overflow → skip transcript +* everything else that fails → transient → persist the user message +""" + +import pytest + + +def _classify(agent_result: dict, history_len: int) -> tuple[bool, bool]: + """Replicate the gateway classifier from GatewayRunner._run_agent. + + Returns ``(agent_failed_early, is_context_overflow_failure)``. + """ + agent_failed_early = bool(agent_result.get("failed")) + err = str(agent_result.get("error", "")).lower() + is_context_overflow_failure = agent_failed_early and ( + bool(agent_result.get("compression_exhausted")) + or any(p in err for p in ( + "context length", "context size", "context window", + "maximum context", "token limit", "too many tokens", + "reduce the length", "exceeds the limit", + "request entity too large", "prompt is too long", + "payload too large", "input is too long", + )) + or ("400" in err and history_len > 50) + ) + return agent_failed_early, is_context_overflow_failure + + +class TestContextOverflowStillSkipsTranscript: + """#1630 behavior must be preserved for real context-overflow cases.""" + + def test_compression_exhausted_is_context_overflow(self): + agent_result = { + "failed": True, + "compression_exhausted": True, + "error": "Request payload too large: max compression attempts reached.", + } + failed, ctx_overflow = _classify(agent_result, history_len=100) + assert failed + assert ctx_overflow + + def test_explicit_context_length_error_is_context_overflow(self): + agent_result = { + "failed": True, + "error": "prompt is too long: 250000 tokens > 200000 maximum", + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert failed + assert ctx_overflow + + def test_generic_400_on_large_session_is_context_overflow(self): + agent_result = { + "failed": True, + "error": "error code: 400 - {'type': 'error', 'message': 'Error'}", + } + failed, ctx_overflow = _classify(agent_result, history_len=100) + assert failed + assert ctx_overflow + + +class TestTransientFailureKeepsUserMessage: + """Transient provider failures must NOT skip the transcript — doing so + drops the user message and the agent forgets the turn. (#7100)""" + + def test_rate_limit_429_is_not_context_overflow(self): + agent_result = { + "failed": True, + "error": ( + "API call failed after 3 retries: 429 Too Many Requests " + "— rate limit exceeded" + ), + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert failed + assert not ctx_overflow + + def test_read_timeout_is_not_context_overflow(self): + agent_result = { + "failed": True, + "error": "ReadTimeout: HTTPSConnectionPool(host='api.z.ai'): Read timed out.", + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert failed + assert not ctx_overflow + + def test_connection_reset_is_not_context_overflow(self): + agent_result = { + "failed": True, + "error": "ConnectionError: [Errno 54] Connection reset by peer", + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert failed + assert not ctx_overflow + + def test_provider_500_is_not_context_overflow(self): + agent_result = { + "failed": True, + "error": "API call failed after 3 retries: 500 Internal Server Error", + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert failed + assert not ctx_overflow + + def test_generic_400_on_short_session_is_not_context_overflow(self): + """A 400 on a short session is a real client error, not context + overflow — still not a reason to drop the user turn.""" + agent_result = { + "failed": True, + "error": "error code: 400 - invalid model", + } + failed, ctx_overflow = _classify(agent_result, history_len=5) + assert failed + assert not ctx_overflow + + +class TestSuccessfulResultUnaffected: + def test_successful_result_neither_failed_nor_overflow(self): + agent_result = { + "final_response": "Hello!", + "messages": [{"role": "assistant", "content": "Hello!"}], + } + failed, ctx_overflow = _classify(agent_result, history_len=10) + assert not failed + assert not ctx_overflow