class CodeAssistError(RuntimeError):
    """Exception raised by the Code Assist (``cloudcode-pa``) integration.

    Besides the human-readable message, the exception carries the HTTP
    status, the raw response object, a parsed retry hint and the structured
    Google error details, so callers can classify the failure (rate limit,
    auth, ...) without re-parsing the message text.

    Args:
        message: Human-readable description, used as ``str(exc)``.
        code: Stable machine-readable tag (e.g. ``"code_assist_rate_limited"``).
        status_code: HTTP status of the failed response, if there was one.
        response: The underlying response object (or any stand-in exposing a
            ``.headers`` mapping), kept so callers can read headers such as
            ``Retry-After`` themselves.
        retry_after: Suggested wait in seconds, if the backend supplied one.
        details: Parsed fields from the Google error envelope.
    """

    def __init__(
        self,
        message: str,
        *,
        code: str = "code_assist_error",
        status_code: Optional[int] = None,
        response: Any = None,
        retry_after: Optional[float] = None,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__(message)
        self.code = code
        # ``status_code`` is what ``agent.error_classifier._extract_status_code``
        # looks for, so a 429 from Code Assist is classified as a rate limit
        # the same way SDK errors are.
        self.status_code = status_code
        # The main loop reads ``error.response.headers["Retry-After"]`` to
        # honor the backend's retry hint.
        self.response = response
        # Parsed retry hint in seconds (from the body's RetryInfo detail or
        # the Retry-After header, whichever was found).
        self.retry_after = retry_after
        # Shallow copy so later mutation of the caller's dict cannot change
        # what this exception reports.
        self.details = dict(details) if details else {}


def _usable_delay(seconds: float) -> Optional[float]:
    """Return *seconds* when it is a finite, non-negative delay, else ``None``."""
    import math

    if math.isfinite(seconds) and seconds >= 0:
        return seconds
    return None


def _duration_to_seconds(raw: Any) -> Optional[float]:
    """Convert a ``google.protobuf.Duration`` JSON value (``"30s"``) to seconds."""
    # bool is a subclass of int; a JSON ``true`` is not a duration.
    if isinstance(raw, bool):
        return None
    if isinstance(raw, (int, float)):
        return _usable_delay(float(raw))
    if isinstance(raw, str) and raw.endswith("s"):
        try:
            # float() also accepts "inf"/"nan"; _usable_delay rejects those.
            return _usable_delay(float(raw[:-1]))
        except ValueError:
            return None
    return None


def _retry_after_header_seconds(value: Any) -> Optional[float]:
    """Parse a ``Retry-After`` header given as delta-seconds or an HTTP-date."""
    text = str(value).strip()
    if not text:
        return None
    try:
        return _usable_delay(float(text))
    except ValueError:
        pass

    # Not a number: try the HTTP-date form ("Wed, 21 Oct 2015 07:28:00 GMT").
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        when = parsedate_to_datetime(text)
    except (TypeError, ValueError, IndexError, OverflowError):
        return None
    if when is None:
        return None
    if when.tzinfo is None:
        when = when.replace(tzinfo=timezone.utc)
    # A date in the past means "retry now", not a negative wait.
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def _model_from_message(text: str) -> str:
    """Pull ``<name>`` out of a ``models/<name>`` reference in *text*, or ``""``."""
    import re

    match = re.search(r"\bmodels/([A-Za-z0-9][\w.\-]*)", text)
    return match.group(1) if match else ""


def _gemini_http_error(response: "httpx.Response") -> CodeAssistError:
    """Translate an HTTP error response into a ``CodeAssistError`` with metadata.

    Parses Google's error envelope (``{"error": {"code", "message", "status",
    "details": [...]}}``) and attaches ``status_code``, ``response``,
    ``retry_after`` and ``details`` to the returned exception.

    A few recognizable conditions get a human-readable message instead of a
    raw body dump:

        429 + MODEL_CAPACITY_EXHAUSTED -> "Gemini capacity exhausted for
                                          <model> (Google-side throttle...)"
        429 + RESOURCE_EXHAUSTED       -> quota-style message
        404                            -> "<model> is not available at
                                          cloudcode-pa.googleapis.com..."

    Args:
        response: The failed response. Only ``status_code``, ``text`` and
            ``headers`` are read.

    Returns:
        A ``CodeAssistError`` (returned, not raised). This function does not
        raise on malformed bodies; unparseable input degrades to a generic
        message that includes the first 500 characters of the body.
    """
    status = response.status_code

    # Read the body once; reading it is best-effort so an undecodable body
    # still yields a structured error.
    try:
        body_text = response.text or ""
    except Exception:
        body_text = ""

    body_json: Dict[str, Any] = {}
    if body_text:
        try:
            parsed = json.loads(body_text)
        except (ValueError, TypeError):
            parsed = None
        if isinstance(parsed, dict):
            body_json = parsed

    # Envelope shape:
    #   {"error": {"code": 429, "message": "...", "status": "RESOURCE_EXHAUSTED",
    #              "details": [{"@type": ".../ErrorInfo", "reason": "...",
    #                           "metadata": {...}},
    #                          {"@type": ".../RetryInfo", "retryDelay": "30s"}]}}
    err_obj = body_json.get("error")
    if not isinstance(err_obj, dict):
        err_obj = {}
    err_status = str(err_obj.get("status") or "").strip()
    err_message = str(err_obj.get("message") or "").strip()
    raw_details = err_obj.get("details")
    err_details_list = raw_details if isinstance(raw_details, list) else []

    # Take the first ErrorInfo that has a reason and the first RetryInfo that
    # has a usable delay; ignore everything else.
    error_reason = ""
    error_metadata: Dict[str, Any] = {}
    retry_delay_seconds: Optional[float] = None
    for detail in err_details_list:
        if not isinstance(detail, dict):
            continue
        type_url = str(detail.get("@type") or "")
        if type_url.endswith("/google.rpc.ErrorInfo"):
            reason = detail.get("reason")
            if not error_reason and isinstance(reason, str) and reason:
                error_reason = reason
                metadata = detail.get("metadata")
                if isinstance(metadata, dict):
                    error_metadata = metadata
        elif type_url.endswith("/google.rpc.RetryInfo"):
            if retry_delay_seconds is None:
                retry_delay_seconds = _duration_to_seconds(detail.get("retryDelay"))

    # Fall back to the Retry-After header if the body had no usable RetryInfo.
    if retry_delay_seconds is None:
        try:
            header_val = response.headers.get("Retry-After") or response.headers.get("retry-after")
        except Exception:
            header_val = None
        if header_val:
            retry_delay_seconds = _retry_after_header_seconds(header_val)

    # ``code_assist_rate_limited`` stays the default for 429s; the capacity
    # tag is more specific but the status code (and therefore the rate-limit
    # classification) is unchanged.
    code = f"code_assist_http_{status}"
    if status == 401:
        code = "code_assist_unauthorized"
    elif status == 429:
        code = "code_assist_rate_limited"
        if error_reason == "MODEL_CAPACITY_EXHAUSTED":
            code = "code_assist_capacity_exhausted"

    model_hint = str(error_metadata.get("model") or error_metadata.get("modelId") or "").strip()

    if status == 429 and error_reason == "MODEL_CAPACITY_EXHAUSTED":
        target = model_hint or "this Gemini model"
        message = (
            f"Gemini capacity exhausted for {target} (Google-side throttle, "
            f"not a Hermes issue). Try a different Gemini model or set a "
            f"fallback_providers entry to a non-Gemini provider."
        )
        if retry_delay_seconds is not None:
            message += f" Google suggests retrying in {retry_delay_seconds:g}s."
    elif status == 429 and err_status == "RESOURCE_EXHAUSTED":
        message = (
            f"Gemini quota exhausted ({err_message or 'RESOURCE_EXHAUSTED'}). "
            f"Check /gquota for remaining daily requests."
        )
        if retry_delay_seconds is not None:
            message += f" Retry suggested in {retry_delay_seconds:g}s."
    elif status == 404:
        # Name the model when we can (metadata first, then a ``models/<name>``
        # reference in Google's message) rather than splicing Google's whole
        # sentence into ours.
        named_model = model_hint or _model_from_message(err_message)
        target = named_model or "the requested model"
        message = (
            f"Code Assist 404: {target} is not available at "
            f"cloudcode-pa.googleapis.com. It may have been renamed or "
            f"retired. Check hermes_cli/models.py for the current list."
        )
        if err_message and not named_model:
            message += f" (Google said: {err_message})"
    elif err_message:
        # Generic fallback with the parsed message.
        message = f"Code Assist HTTP {status} ({err_status or 'error'}): {err_message}"
    else:
        # Last-ditch fallback: raw body snippet.
        message = f"Code Assist returned HTTP {status}: {body_text[:500]}"

    return CodeAssistError(
        message,
        code=code,
        status_code=status,
        response=response,
        retry_after=retry_delay_seconds,
        details={
            "status": err_status,
            "reason": error_reason,
            "metadata": error_metadata,
            "message": err_message,
        },
    )
class TestGeminiHttpErrorParsing:
    """Regression coverage for ``_gemini_http_error`` Google-envelope parsing.

    These are the paths users hit during Google-side throttling
    (MODEL_CAPACITY_EXHAUSTED on 429, and 404 for a model that is no longer
    served). The error needs to carry ``status_code`` and ``response`` so the
    main loop's error classifier and Retry-After logic work.
    """

    @staticmethod
    def _fake_response(status: int, body: "dict | str" = "", headers=None):
        """Build a minimal duck-typed stand-in for ``httpx.Response``.

        Only the attributes ``_gemini_http_error`` reads are provided:
        ``status_code``, ``text`` and ``headers``. A dict *body* is
        JSON-encoded; a string is used verbatim.
        """
        class _FakeResponse:
            def __init__(self):
                self.status_code = status
                if isinstance(body, dict):
                    self.text = json.dumps(body)
                else:
                    self.text = body
                self.headers = headers or {}
        return _FakeResponse()

    def test_model_capacity_exhausted_produces_friendly_message(self):
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        body = {
            "error": {
                "code": 429,
                "message": "Resource has been exhausted (e.g. check quota).",
                "status": "RESOURCE_EXHAUSTED",
                "details": [
                    {
                        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
                        "reason": "MODEL_CAPACITY_EXHAUSTED",
                        "domain": "googleapis.com",
                        "metadata": {"model": "gemini-2.5-pro"},
                    },
                    {
                        "@type": "type.googleapis.com/google.rpc.RetryInfo",
                        "retryDelay": "30s",
                    },
                ],
            }
        }
        resp = self._fake_response(429, body)
        err = _gemini_http_error(resp)
        assert err.status_code == 429
        assert err.code == "code_assist_capacity_exhausted"
        assert err.retry_after == 30.0
        assert err.details["reason"] == "MODEL_CAPACITY_EXHAUSTED"
        # Message must be user-friendly, not a raw JSON dump.
        message = str(err)
        assert "gemini-2.5-pro" in message
        assert "capacity exhausted" in message.lower()
        assert "30s" in message
        # The very same response object must be attached, so the caller's
        # Retry-After header lookup reads the real headers.
        assert err.response is resp

    def test_resource_exhausted_without_reason(self):
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        body = {
            "error": {
                "code": 429,
                "message": "Quota exceeded for requests per minute.",
                "status": "RESOURCE_EXHAUSTED",
            }
        }
        err = _gemini_http_error(self._fake_response(429, body))
        assert err.status_code == 429
        assert err.code == "code_assist_rate_limited"
        # No RetryInfo detail and no Retry-After header: no retry hint.
        assert err.retry_after is None
        message = str(err)
        assert "quota" in message.lower()

    def test_404_model_not_found_produces_model_retired_message(self):
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        body = {
            "error": {
                "code": 404,
                "message": "models/gemma-4-26b-it is not found for API version v1internal",
                "status": "NOT_FOUND",
            }
        }
        err = _gemini_http_error(self._fake_response(404, body))
        assert err.status_code == 404
        message = str(err)
        assert "not available" in message.lower() or "retired" in message.lower()
        # Error message should reference the actual model text from Google.
        assert "gemma-4-26b-it" in message

    def test_unauthorized_preserves_status_code(self):
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        err = _gemini_http_error(self._fake_response(
            401, {"error": {"code": 401, "message": "Invalid token", "status": "UNAUTHENTICATED"}},
        ))
        assert err.status_code == 401
        assert err.code == "code_assist_unauthorized"

    def test_retry_after_header_fallback(self):
        """If the body has no RetryInfo detail, fall back to Retry-After header."""
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        resp = self._fake_response(
            429,
            {"error": {"code": 429, "message": "Rate limited", "status": "RESOURCE_EXHAUSTED"}},
            headers={"Retry-After": "45"},
        )
        err = _gemini_http_error(resp)
        assert err.retry_after == 45.0

    def test_malformed_body_still_produces_structured_error(self):
        """Non-JSON body must not swallow status_code — we still want the classifier path."""
        from agent.gemini_cloudcode_adapter import _gemini_http_error

        err = _gemini_http_error(self._fake_response(500, "internal error"))
        assert err.status_code == 500
        message = str(err)
        assert "500" in message
        # Raw body snippet must still be there for debugging.
        assert "internal error" in message

    def test_status_code_flows_through_error_classifier(self):
        """End-to-end: CodeAssistError from a 429 must classify as rate_limit.

        This is the whole point of adding status_code to CodeAssistError —
        the classifier must see it and report FailoverReason.rate_limit, so
        the main loop triggers fallback_providers.
        """
        from agent.gemini_cloudcode_adapter import _gemini_http_error
        from agent.error_classifier import classify_api_error, FailoverReason

        body = {
            "error": {
                "code": 429,
                "message": "Resource has been exhausted",
                "status": "RESOURCE_EXHAUSTED",
                "details": [
                    {
                        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
                        "reason": "MODEL_CAPACITY_EXHAUSTED",
                        "metadata": {"model": "gemini-2.5-pro"},
                    }
                ],
            }
        }
        err = _gemini_http_error(self._fake_response(429, body))

        classified = classify_api_error(
            err, provider="google-gemini-cli", model="gemini-2.5-pro",
        )
        assert classified.status_code == 429
        assert classified.reason == FailoverReason.rate_limit