diff --git a/agent/gemini_cloudcode_adapter.py b/agent/gemini_cloudcode_adapter.py
index 36ba288eb..ed687bffd 100644
--- a/agent/gemini_cloudcode_adapter.py
+++ b/agent/gemini_cloudcode_adapter.py
@@ -747,18 +747,149 @@ class GeminiCloudCodeClient:
 
 
 def _gemini_http_error(response: httpx.Response) -> CodeAssistError:
+    """Translate an httpx response into a CodeAssistError with rich metadata.
+
+    Parses Google's error envelope (``{"error": {"code", "message", "status",
+    "details": [...]}}``) so the agent's error classifier can reason about
+    the failure — ``status_code`` enables the rate_limit / auth classification
+    paths, and ``response`` lets the main loop honor ``Retry-After`` just
+    like it does for OpenAI SDK exceptions.
+
+    Also lifts a few recognizable Google conditions into human-readable
+    messages so the user sees something better than a 500-char JSON dump:
+
+        MODEL_CAPACITY_EXHAUSTED → "Gemini model capacity exhausted for
+            <model>. This is a Google-side throttle..."
+        RESOURCE_EXHAUSTED w/o reason → quota-style message
+        404 → "Model <name> not found at cloudcode-pa..."
+    """
     status = response.status_code
+
+    # Parse the body once, surviving any weird encodings.
+    body_text = ""
+    body_json: Dict[str, Any] = {}
     try:
-        body = response.text[:500]
+        body_text = response.text
     except Exception:
-        body = ""
-    # Let run_agent's retry logic see auth errors as rotatable via `api_key`
+        body_text = ""
+    if body_text:
+        try:
+            parsed = json.loads(body_text)
+            if isinstance(parsed, dict):
+                body_json = parsed
+        except (ValueError, TypeError):
+            body_json = {}
+
+    # Dig into Google's error envelope.  Shape is:
+    #   {"error": {"code": 429, "message": "...", "status": "RESOURCE_EXHAUSTED",
+    #              "details": [{"@type": ".../ErrorInfo", "reason": "MODEL_CAPACITY_EXHAUSTED",
+    #                           "metadata": {...}},
+    #                          {"@type": ".../RetryInfo", "retryDelay": "30s"}]}}
+    err_obj = body_json.get("error") if isinstance(body_json, dict) else None
+    if not isinstance(err_obj, dict):
+        err_obj = {}
+    err_status = str(err_obj.get("status") or "").strip()
+    err_message = str(err_obj.get("message") or "").strip()
+    err_details_list = err_obj.get("details") if isinstance(err_obj.get("details"), list) else []
+
+    # Extract google.rpc.ErrorInfo reason + metadata.  There may be more
+    # than one ErrorInfo (rare), so we pick the first one with a reason.
+    error_reason = ""
+    error_metadata: Dict[str, Any] = {}
+    retry_delay_seconds: Optional[float] = None
+    for detail in err_details_list:
+        if not isinstance(detail, dict):
+            continue
+        type_url = str(detail.get("@type") or "")
+        if not error_reason and type_url.endswith("/google.rpc.ErrorInfo"):
+            reason = detail.get("reason")
+            if isinstance(reason, str) and reason:
+                error_reason = reason
+            md = detail.get("metadata")
+            if isinstance(md, dict):
+                error_metadata = md
+        elif retry_delay_seconds is None and type_url.endswith("/google.rpc.RetryInfo"):
+            # retryDelay is a google.protobuf.Duration string like "30s" or "1.5s".
+            delay_raw = detail.get("retryDelay")
+            if isinstance(delay_raw, str) and delay_raw.endswith("s"):
+                try:
+                    retry_delay_seconds = float(delay_raw[:-1])
+                except ValueError:
+                    pass
+            elif isinstance(delay_raw, (int, float)):
+                retry_delay_seconds = float(delay_raw)
+
+    # Fall back to the Retry-After header if the body didn't include RetryInfo.
+    if retry_delay_seconds is None:
+        try:
+            header_val = response.headers.get("Retry-After") or response.headers.get("retry-after")
+        except Exception:
+            header_val = None
+        if header_val:
+            try:
+                retry_delay_seconds = float(header_val)
+            except (TypeError, ValueError):
+                retry_delay_seconds = None
+
+    # Classify the error code.  ``code_assist_rate_limited`` stays the default
+    # for 429s; a more specific reason tag helps downstream callers (e.g. tests,
+    # logs) without changing the rate_limit classification path.
     code = f"code_assist_http_{status}"
     if status == 401:
         code = "code_assist_unauthorized"
     elif status == 429:
         code = "code_assist_rate_limited"
+        if error_reason == "MODEL_CAPACITY_EXHAUSTED":
+            code = "code_assist_capacity_exhausted"
+
+    # Build a human-readable message.  Keep the status + a raw-body tail for
+    # debugging, but lead with a friendlier summary when we recognize the
+    # Google signal.
+    model_hint = ""
+    if isinstance(error_metadata, dict):
+        model_hint = str(error_metadata.get("model") or error_metadata.get("modelId") or "").strip()
+
+    if status == 429 and error_reason == "MODEL_CAPACITY_EXHAUSTED":
+        target = model_hint or "this Gemini model"
+        message = (
+            f"Gemini capacity exhausted for {target} (Google-side throttle, "
+            f"not a Hermes issue). Try a different Gemini model or set a "
+            f"fallback_providers entry to a non-Gemini provider."
+        )
+        if retry_delay_seconds is not None:
+            message += f" Google suggests retrying in {retry_delay_seconds:g}s."
+    elif status == 429 and err_status == "RESOURCE_EXHAUSTED":
+        message = (
+            f"Gemini quota exhausted ({err_message or 'RESOURCE_EXHAUSTED'}). "
+            f"Check /gquota for remaining daily requests."
+        )
+        if retry_delay_seconds is not None:
+            message += f" Retry suggested in {retry_delay_seconds:g}s."
+    elif status == 404:
+        # Google returns 404 when a model has been retired or renamed.
+        target = model_hint or (err_message or "model")
+        message = (
+            f"Code Assist 404: {target} is not available at "
+            f"cloudcode-pa.googleapis.com. It may have been renamed or "
+            f"retired. Check hermes_cli/models.py for the current list."
+        )
+    elif err_message:
+        # Generic fallback with the parsed message.
+        message = f"Code Assist HTTP {status} ({err_status or 'error'}): {err_message}"
+    else:
+        # Last-ditch fallback — raw body snippet.
+        message = f"Code Assist returned HTTP {status}: {body_text[:500]}"
+
     return CodeAssistError(
-        f"Code Assist returned HTTP {status}: {body}",
+        message,
         code=code,
+        status_code=status,
+        response=response,
+        retry_after=retry_delay_seconds,
+        details={
+            "status": err_status,
+            "reason": error_reason,
+            "metadata": error_metadata,
+            "message": err_message,
+        },
     )
diff --git a/agent/google_code_assist.py b/agent/google_code_assist.py
index 1acf3ea13..eba09b8f4 100644
--- a/agent/google_code_assist.py
+++ b/agent/google_code_assist.py
@@ -68,9 +68,45 @@ _ONBOARDING_POLL_INTERVAL_SECONDS = 5.0
 
 
 class CodeAssistError(RuntimeError):
-    def __init__(self, message: str, *, code: str = "code_assist_error") -> None:
+    """Exception raised by the Code Assist (``cloudcode-pa``) integration.
+
+    Carries HTTP status / response / retry-after metadata so the agent's
+    ``error_classifier._extract_status_code`` and the main loop's Retry-After
+    handling (which walks ``error.response.headers``) pick up the right
+    signals.  Without these, 429s from the OAuth path look like opaque
+    ``RuntimeError`` and skip the rate-limit path.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        code: str = "code_assist_error",
+        status_code: Optional[int] = None,
+        response: Any = None,
+        retry_after: Optional[float] = None,
+        details: Optional[Dict[str, Any]] = None,
+    ) -> None:
         super().__init__(message)
         self.code = code
+        # ``status_code`` is picked up by ``agent.error_classifier._extract_status_code``
+        # so a 429 from Code Assist classifies as FailoverReason.rate_limit and
+        # triggers the main loop's fallback_providers chain the same way SDK
+        # errors do.
+        self.status_code = status_code
+        # ``response`` is the underlying ``httpx.Response`` (or a shim with a
+        # ``.headers`` mapping and ``.json()`` method).  The main loop reads
+        # ``error.response.headers["Retry-After"]`` to honor Google's retry
+        # hints when the backend throttles us.
+        self.response = response
+        # Parsed ``Retry-After`` seconds (kept separately for convenience —
+        # Google returns retry hints in both the header and the error body's
+        # ``google.rpc.RetryInfo`` details, and we pick whichever we found).
+        self.retry_after = retry_after
+        # Parsed structured error details from the Google error envelope
+        # (e.g. ``{"reason": "MODEL_CAPACITY_EXHAUSTED", "status": "RESOURCE_EXHAUSTED"}``).
+        # Useful for logging and for tests that want to assert on specifics.
+        self.details = details or {}
 
 
 class ProjectIdRequiredError(CodeAssistError):
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index b30af6e48..81bac6c92 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -125,7 +125,6 @@ DEFAULT_CONTEXT_LENGTHS = {
     "gemini": 1048576,
     # Gemma (open models served via AI Studio)
     "gemma-4-31b": 256000,
-    "gemma-4-26b": 256000,
     "gemma-3": 131072,
     "gemma": 8192,  # fallback for older gemma models
     # DeepSeek
diff --git a/hermes_cli/models.py b/hermes_cli/models.py
index 7a897cb79..d2859e01c 100644
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -135,7 +135,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "gemini-2.5-flash-lite",
         # Gemma open models (also served via AI Studio)
         "gemma-4-31b-it",
-        "gemma-4-26b-it",
     ],
     "google-gemini-cli": [
         "gemini-2.5-pro",
diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index 6b4fc5d73..95c9cae77 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -91,7 +91,7 @@ _DEFAULT_PROVIDER_MODELS = {
     "gemini": [
         "gemini-3.1-pro-preview", "gemini-3-flash-preview", "gemini-3.1-flash-lite-preview",
         "gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite",
-        "gemma-4-31b-it", "gemma-4-26b-it",
+        "gemma-4-31b-it",
     ],
     "zai": ["glm-5.1", "glm-5", "glm-4.7", "glm-4.5", "glm-4.5-flash"],
     "kimi-coding": ["kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"],
diff --git a/tests/agent/test_gemini_cloudcode.py b/tests/agent/test_gemini_cloudcode.py
index cf5e80f08..c9d2b87df 100644
--- a/tests/agent/test_gemini_cloudcode.py
+++ b/tests/agent/test_gemini_cloudcode.py
@@ -826,6 +826,160 @@ class TestGeminiCloudCodeClient:
         finally:
             client.close()
 
+
+class TestGeminiHttpErrorParsing:
+    """Regression coverage for _gemini_http_error Google-envelope parsing.
+
+    These are the paths that users actually hit during Google-side throttling
+    (April 2026: gemini-2.5-pro MODEL_CAPACITY_EXHAUSTED, gemma-4-26b-it
+    returning 404).  The error needs to carry status_code + response so the
+    main loop's error_classifier and Retry-After logic work.
+    """
+
+    @staticmethod
+    def _fake_response(status: int, body: dict | str = "", headers=None):
+        """Minimal httpx.Response stand-in (duck-typed for _gemini_http_error)."""
+        class _FakeResponse:
+            def __init__(self):
+                self.status_code = status
+                if isinstance(body, dict):
+                    self.text = json.dumps(body)
+                else:
+                    self.text = body
+                self.headers = headers or {}
+        return _FakeResponse()
+
+    def test_model_capacity_exhausted_produces_friendly_message(self):
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        body = {
+            "error": {
+                "code": 429,
+                "message": "Resource has been exhausted (e.g. check quota).",
+                "status": "RESOURCE_EXHAUSTED",
+                "details": [
+                    {
+                        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
+                        "reason": "MODEL_CAPACITY_EXHAUSTED",
+                        "domain": "googleapis.com",
+                        "metadata": {"model": "gemini-2.5-pro"},
+                    },
+                    {
+                        "@type": "type.googleapis.com/google.rpc.RetryInfo",
+                        "retryDelay": "30s",
+                    },
+                ],
+            }
+        }
+        err = _gemini_http_error(self._fake_response(429, body))
+        assert err.status_code == 429
+        assert err.code == "code_assist_capacity_exhausted"
+        assert err.retry_after == 30.0
+        assert err.details["reason"] == "MODEL_CAPACITY_EXHAUSTED"
+        # Message must be user-friendly, not a raw JSON dump.
+        message = str(err)
+        assert "gemini-2.5-pro" in message
+        assert "capacity exhausted" in message.lower()
+        assert "30s" in message
+        # response attr is preserved for run_agent's Retry-After header path.
+        assert err.response is not None
+
+    def test_resource_exhausted_without_reason(self):
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        body = {
+            "error": {
+                "code": 429,
+                "message": "Quota exceeded for requests per minute.",
+                "status": "RESOURCE_EXHAUSTED",
+            }
+        }
+        err = _gemini_http_error(self._fake_response(429, body))
+        assert err.status_code == 429
+        assert err.code == "code_assist_rate_limited"
+        message = str(err)
+        assert "quota" in message.lower()
+
+    def test_404_model_not_found_produces_model_retired_message(self):
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        body = {
+            "error": {
+                "code": 404,
+                "message": "models/gemma-4-26b-it is not found for API version v1internal",
+                "status": "NOT_FOUND",
+            }
+        }
+        err = _gemini_http_error(self._fake_response(404, body))
+        assert err.status_code == 404
+        message = str(err)
+        assert "not available" in message.lower() or "retired" in message.lower()
+        # Error message should reference the actual model text from Google.
+        assert "gemma-4-26b-it" in message
+
+    def test_unauthorized_preserves_status_code(self):
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        err = _gemini_http_error(self._fake_response(
+            401, {"error": {"code": 401, "message": "Invalid token", "status": "UNAUTHENTICATED"}},
+        ))
+        assert err.status_code == 401
+        assert err.code == "code_assist_unauthorized"
+
+    def test_retry_after_header_fallback(self):
+        """If the body has no RetryInfo detail, fall back to Retry-After header."""
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        resp = self._fake_response(
+            429,
+            {"error": {"code": 429, "message": "Rate limited", "status": "RESOURCE_EXHAUSTED"}},
+            headers={"Retry-After": "45"},
+        )
+        err = _gemini_http_error(resp)
+        assert err.retry_after == 45.0
+
+    def test_malformed_body_still_produces_structured_error(self):
+        """Non-JSON body must not swallow status_code — we still want the classifier path."""
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+
+        err = _gemini_http_error(self._fake_response(500, "<html>internal error</html>"))
+        assert err.status_code == 500
+        # Raw body snippet must still be there for debugging.
+        assert "500" in str(err)
+
+    def test_status_code_flows_through_error_classifier(self):
+        """End-to-end: CodeAssistError from a 429 must classify as rate_limit.
+
+        This is the whole point of adding status_code to CodeAssistError —
+        _extract_status_code must see it and FailoverReason.rate_limit must
+        fire, so the main loop triggers fallback_providers.
+        """
+        from agent.gemini_cloudcode_adapter import _gemini_http_error
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        body = {
+            "error": {
+                "code": 429,
+                "message": "Resource has been exhausted",
+                "status": "RESOURCE_EXHAUSTED",
+                "details": [
+                    {
+                        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
+                        "reason": "MODEL_CAPACITY_EXHAUSTED",
+                        "metadata": {"model": "gemini-2.5-pro"},
+                    }
+                ],
+            }
+        }
+        err = _gemini_http_error(self._fake_response(429, body))
+
+        classified = classify_api_error(
+            err, provider="google-gemini-cli", model="gemini-2.5-pro",
+        )
+        assert classified.status_code == 429
+        assert classified.reason == FailoverReason.rate_limit
+
+
 # =============================================================================
 # Provider registration
 # =============================================================================
diff --git a/tests/hermes_cli/test_gemini_provider.py b/tests/hermes_cli/test_gemini_provider.py
index b448ca513..089a5cf98 100644
--- a/tests/hermes_cli/test_gemini_provider.py
+++ b/tests/hermes_cli/test_gemini_provider.py
@@ -178,10 +178,6 @@ class TestGeminiContextLength:
             ctx = get_model_context_length("gemma-4-31b-it", provider="gemini")
         assert ctx == 256000
 
-    def test_gemma_4_26b_context(self):
-        ctx = get_model_context_length("gemma-4-26b-it", provider="gemini")
-        assert ctx == 256000
-
     def test_gemini_3_context(self):
         ctx = get_model_context_length("gemini-3.1-pro-preview", provider="gemini")
         assert ctx == 1048576