From c946e6709fa68b8a85f8e71709dd6642016b46a8 Mon Sep 17 00:00:00 2001
From: linyubin <linyubin@users.noreply.github.com>
Date: Sat, 27 Jun 2026 18:55:43 -0700
Subject: [PATCH] fix(agent): activate fallback on persistent transport
 failures (#22277)

Eager fallback previously fired only on rate_limit/billing. A stale-
detector-killed hung stream classifies as FailoverReason.timeout
(retryable=True) and the retry loop re-hit the same dead primary until
the budget exhausted -- 3 x ~180-300s stale kills compounding into a
15+ min silent hang while the configured fallback chain sat idle.

Extend the existing eager-fallback gate to also cover timeout and
overloaded, but only after one real retry (retry_count >= 2) so genuine
transient hiccups still recover on the primary. Reuses the same
pool-recovery guard and state-reset as the rate_limit branch -- no new
config flag, no change to the rate-limit intent.

Salvaged from PR #50228 by @linyubin. Closes #22277.

Co-authored-by: Hermes Agent <127238744+teknium1@users.noreply.github.com>
---
 agent/conversation_loop.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index 959a55dc0e9..6dba9e502a9 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -2853,15 +2853,25 @@ def run_conversation(
                     # Fall through to normal error handling if compression
                     # is exhausted or didn't help.
 
-                # Eager fallback for rate-limit errors (429 or quota exhaustion).
-                # When a fallback model is configured, switch immediately instead
-                # of burning through retries with exponential backoff -- the
-                # primary provider won't recover within the retry window.
+                # Eager fallback for rate-limit errors (429 or quota exhaustion)
+                # and transport errors (connection failure / timeout / provider
+                # overloaded).  Rate limits and billing: switch immediately —
+                # the primary provider won't recover within the retry window.
+                # Transport errors: allow 1 retry first (transient hiccups
+                # recover), then fall back if the provider is truly unreachable.
                 is_rate_limited = classified.reason in {
                     FailoverReason.rate_limit,
                     FailoverReason.billing,
                 }
-                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
+                _is_transport_failure = classified.reason in {
+                    FailoverReason.timeout,
+                    FailoverReason.overloaded,
+                }
+                _should_fallback = (
+                    is_rate_limited
+                    or (_is_transport_failure and retry_count >= 2)
+                )
+                if _should_fallback and agent._fallback_index < len(agent._fallback_chain):
                     # Don't eagerly fallback if credential pool rotation may
                     # still recover.  See _pool_may_recover_from_rate_limit
                     # for the single-credential-pool and CloudCode-quota
@@ -2876,6 +2886,10 @@ def run_conversation(
                             agent._buffer_status(
                                 "⚠️ Billing or credits exhausted — switching to fallback provider..."
                             )
+                        elif _is_transport_failure:
+                            agent._buffer_status(
+                                "⚠️ Provider unreachable — switching to fallback provider..."
+                            )
                         else:
                             agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                         if agent._try_activate_fallback(reason=classified.reason):