diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py
index b00db5cf12..8c2979b6bc 100644
--- a/hermes_cli/runtime_provider.py
+++ b/hermes_cli/runtime_provider.py
@@ -24,6 +24,18 @@ def _normalize_custom_provider_name(value: str) -> str:
     return value.strip().lower().replace(" ", "-")
 
 
+def _detect_api_mode_for_url(base_url: str) -> Optional[str]:
+    """Auto-detect api_mode from the resolved base URL.
+
+    Direct api.openai.com endpoints need the Responses API for GPT-5.x
+    tool calls with reasoning (chat/completions returns 400).
+    """
+    normalized = (base_url or "").strip().lower().rstrip("/")
+    if "api.openai.com" in normalized and "openrouter" not in normalized:
+        return "codex_responses"
+    return None
+
+
 def _auto_detect_local_model(base_url: str) -> str:
     """Query a local server for its model name when only one model is loaded."""
     if not base_url:
@@ -185,7 +197,9 @@ def _resolve_named_custom_runtime(
 
     return {
         "provider": "openrouter",
-        "api_mode": custom_provider.get("api_mode", "chat_completions"),
+        "api_mode": custom_provider.get("api_mode")
+        or _detect_api_mode_for_url(base_url)
+        or "chat_completions",
         "base_url": base_url,
         "api_key": api_key,
         "source": f"custom_provider:{custom_provider.get('name', requested_provider)}",
@@ -263,7 +277,9 @@ def _resolve_openrouter_runtime(
 
     return {
         "provider": "openrouter",
-        "api_mode": _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions",
+        "api_mode": _parse_api_mode(model_cfg.get("api_mode"))
+        or _detect_api_mode_for_url(base_url)
+        or "chat_completions",
         "base_url": base_url,
         "api_key": api_key,
         "source": source,
diff --git a/run_agent.py b/run_agent.py
index cb0855f878..e8365639bc 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -501,6 +501,12 @@ class AIAgent:
         else:
             self.api_mode = "chat_completions"
 
+        # Direct OpenAI sessions use the Responses API path. GPT-5.x tool
+        # calls with reasoning are rejected on /v1/chat/completions, and
+        # Hermes is a tool-using client by default.
+        if self.api_mode == "chat_completions" and self._is_direct_openai_url():
+            self.api_mode = "codex_responses"
+
         # Pre-warm OpenRouter model metadata cache in a background thread.
         # fetch_model_metadata() is cached for 1 hour; this avoids a blocking
         # HTTP request on the first API response when pricing is estimated.
@@ -1080,6 +1086,11 @@ class AIAgent:
             return
         self._safe_print(*args, **kwargs)
 
+    def _is_direct_openai_url(self, base_url: str = None) -> bool:
+        """Return True when a base URL targets OpenAI's native API."""
+        url = (base_url or self._base_url_lower).lower()
+        return "api.openai.com" in url and "openrouter" not in url
+
     def _max_tokens_param(self, value: int) -> dict:
         """Return the correct max tokens kwarg for the current provider.
 
@@ -1087,11 +1098,7 @@ class AIAgent:
         'max_completion_tokens'. OpenRouter, local models, and older
         OpenAI models use 'max_tokens'.
         """
-        _is_direct_openai = (
-            "api.openai.com" in self._base_url_lower
-            and "openrouter" not in self._base_url_lower
-        )
-        if _is_direct_openai:
+        if self._is_direct_openai_url():
             return {"max_completion_tokens": value}
         return {"max_tokens": value}
 
@@ -3553,13 +3560,15 @@ class AIAgent:
                                  fb_provider)
                 return False
 
-            # Determine api_mode from provider
+            # Determine api_mode from provider / base URL
             fb_api_mode = "chat_completions"
             fb_base_url = str(fb_client.base_url)
             if fb_provider == "openai-codex":
                 fb_api_mode = "codex_responses"
             elif fb_provider == "anthropic" or fb_base_url.rstrip("/").lower().endswith("/anthropic"):
                 fb_api_mode = "anthropic_messages"
+            elif self._is_direct_openai_url(fb_base_url):
+                fb_api_mode = "codex_responses"
 
             old_model = self.model
             self.model = fb_model