From 0c1217d01ec3a8420391e14ea859f97c95ee624d Mon Sep 17 00:00:00 2001 From: Teknium Date: Wed, 15 Apr 2026 22:27:26 -0700 Subject: [PATCH] feat(xai): upgrade to Responses API, add TTS provider Cherry-picked and trimmed from PR #10600 by Jaaneek. - Switch xAI transport from openai_chat to codex_responses (Responses API) - Add codex_responses detection for xAI in all runtime_provider resolution paths - Add xAI api_mode detection in AIAgent.__init__ (provider name + URL auto-detect) - Add extra_headers passthrough for codex_responses requests - Add x-grok-conv-id session header for xAI prompt caching - Add xAI reasoning support (encrypted_content include, no effort param) - Move x-grok-conv-id from chat_completions path to codex_responses path - Add xAI TTS provider (dedicated /v1/tts endpoint with Opus conversion) - Add xAI provider aliases (grok, x-ai, x.ai) across auth, models, providers, auxiliary - Trim xAI model list to agentic models (grok-4.20-reasoning, grok-4-1-fast-reasoning) - Add XAI_API_KEY/XAI_BASE_URL to OPTIONAL_ENV_VARS - Add xAI TTS config section, setup wizard entry, tools_config provider option - Add shared xai_http.py helper for User-Agent string Co-authored-by: Jaaneek --- agent/auxiliary_client.py | 3 ++ hermes_cli/auth.py | 1 + hermes_cli/config.py | 24 +++++++++- hermes_cli/main.py | 2 +- hermes_cli/models.py | 11 +---- hermes_cli/nous_subscription.py | 1 + hermes_cli/providers.py | 3 +- hermes_cli/runtime_provider.py | 8 ++++ hermes_cli/setup.py | 21 ++++++++- hermes_cli/tools_config.py | 8 ++++ run_agent.py | 38 ++++++++++++---- tools/tts_tool.py | 79 ++++++++++++++++++++++++++++++++- tools/xai_http.py | 12 +++++ toolsets.py | 2 +- 14 files changed, 189 insertions(+), 24 deletions(-) create mode 100644 tools/xai_http.py diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 34d7d4250..bc6b1efbe 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -58,6 +58,9 @@ _PROVIDER_ALIASES = { "google": "gemini", "google-gemini": "gemini", "google-ai-studio": "gemini", + "x-ai": "xai", + "x.ai": "xai", + "grok": "xai", "glm": "zai", "z-ai": "zai", "z.ai": "zai", diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index 966082787..556e26f97 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -928,6 +928,7 @@ def resolve_provider( _PROVIDER_ALIASES = { "glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai", "google": "gemini", "google-gemini": "gemini", "google-ai-studio": "gemini", + "x-ai": "xai", "x.ai": "xai", "grok": "xai", "kimi": "kimi-coding", "kimi-for-coding": "kimi-coding", "moonshot": "kimi-coding", "kimi-cn": "kimi-coding-cn", "moonshot-cn": "kimi-coding-cn", "arcee-ai": "arcee", "arceeai": "arcee", diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7f639726f..a85997f8f 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -566,7 +566,7 @@ DEFAULT_CONFIG = { # Text-to-speech configuration "tts": { - "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local) + "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) "edge": { "voice": "en-US-AriaNeural", # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural @@ -580,6 +580,12 @@ DEFAULT_CONFIG = { "voice": "alloy", # Voices: alloy, echo, fable, onyx, nova, shimmer }, + "xai": { + "voice_id": "eve", + "language": "en", + "sample_rate": 24000, + "bit_rate": 128000, + }, "mistral": { "model": "voxtral-mini-tts-2603", "voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8", # Paul - Neutral @@ -836,6 +842,22 @@ OPTIONAL_ENV_VARS = { "category": "provider", "advanced": True, }, + "XAI_API_KEY": { + "description": "xAI API key", + "prompt": "xAI API key", + "url": "https://console.x.ai/", + "password": True, + "category": "provider", + "advanced": True, + }, + "XAI_BASE_URL": { + "description": "xAI base URL override", + "prompt": "xAI base URL (leave empty for default)", + "url": None, + "password": False, + "category": "provider", + "advanced": True, + }, "GLM_API_KEY": { "description": "Z.AI / GLM API key (also recognized as ZAI_API_KEY / Z_AI_API_KEY)", "prompt": "Z.AI / GLM API key", diff --git a/hermes_cli/main.py b/hermes_cli/main.py index f7b95ff38..d1ee08c49 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -4890,7 +4890,7 @@ For more help on a command: ) chat_parser.add_argument( "--provider", - choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "gemini", "ollama-cloud", "huggingface", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "kilocode", "xiaomi", "arcee"], + choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "gemini", "xai", "ollama-cloud", "huggingface", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "kilocode", "xiaomi", "arcee"], default=None, help="Inference provider (default: auto)" ) diff --git a/hermes_cli/models.py b/hermes_cli/models.py index 9812fc97e..a298dc99c 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -145,17 +145,8 @@ _PROVIDER_MODELS: dict[str, list[str]] = { "glm-4.5-flash", ], "xai": [ - "grok-4.20-0309-reasoning", - "grok-4.20-0309-non-reasoning", - "grok-4.20-multi-agent-0309", + "grok-4.20-reasoning", "grok-4-1-fast-reasoning", - "grok-4-1-fast-non-reasoning", - "grok-4-fast-reasoning", - "grok-4-fast-non-reasoning", - "grok-4-0709", - "grok-code-fast-1", - "grok-3", - "grok-3-mini", ], "kimi-coding": [ "kimi-for-coding", diff --git a/hermes_cli/nous_subscription.py b/hermes_cli/nous_subscription.py index f1e4366c1..e182b37e7 100644 --- a/hermes_cli/nous_subscription.py +++ b/hermes_cli/nous_subscription.py @@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str: "openai": "OpenAI TTS", "elevenlabs": "ElevenLabs", "edge": "Edge TTS", + "xai": "xAI TTS", "mistral": "Mistral Voxtral TTS", "neutts": "NeuTTS", } diff --git a/hermes_cli/providers.py b/hermes_cli/providers.py index eae832055..8b5b35fe5 100644 --- a/hermes_cli/providers.py +++ b/hermes_cli/providers.py @@ -128,7 +128,7 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = { base_url_env_var="HF_BASE_URL", ), "xai": HermesOverlay( - transport="openai_chat", + transport="codex_responses", base_url_override="https://api.x.ai/v1", base_url_env_var="XAI_BASE_URL", ), @@ -184,6 +184,7 @@ ALIASES: Dict[str, str] = { # xai "x-ai": "xai", "x.ai": "xai", + "grok": "xai", # kimi-for-coding (models.dev ID) "kimi": "kimi-for-coding", diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py index 33b35562f..ffd97a6ca 100644 --- a/hermes_cli/runtime_provider.py +++ b/hermes_cli/runtime_provider.py @@ -41,6 +41,8 @@ def _detect_api_mode_for_url(base_url: str) -> Optional[str]: tool calls with reasoning (chat/completions returns 400). """ normalized = (base_url or "").strip().lower().rstrip("/") + if "api.x.ai" in normalized: + return "codex_responses" if "api.openai.com" in normalized and "openrouter" not in normalized: return "codex_responses" return None @@ -163,6 +165,8 @@ def _resolve_runtime_from_pool_entry( base_url = cfg_base_url or base_url or "https://api.anthropic.com" elif provider == "openrouter": base_url = base_url or OPENROUTER_BASE_URL + elif provider == "xai": + api_mode = "codex_responses" elif provider == "nous": api_mode = "chat_completions" elif provider == "copilot": @@ -628,6 +632,8 @@ def _resolve_explicit_runtime( api_mode = "chat_completions" if provider == "copilot": api_mode = _copilot_runtime_api_mode(model_cfg, api_key) + elif provider == "xai": + api_mode = "codex_responses" else: configured_mode = _parse_api_mode(model_cfg.get("api_mode")) if configured_mode: @@ -924,6 +930,8 @@ def resolve_runtime_provider( api_mode = "chat_completions" if provider == "copilot": api_mode = _copilot_runtime_api_mode(model_cfg, creds.get("api_key", "")) + elif provider == "xai": + api_mode = "codex_responses" else: configured_provider = str(model_cfg.get("provider") or "").strip().lower() # Only honor persisted api_mode when it belongs to the same provider family. diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 52f6e36d6..eafe3b633 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -920,6 +920,7 @@ def _setup_tts_provider(config: dict): "edge": "Edge TTS", "elevenlabs": "ElevenLabs", "openai": "OpenAI TTS", + "xai": "xAI TTS", "minimax": "MiniMax TTS", "mistral": "Mistral Voxtral TTS", "neutts": "NeuTTS", @@ -941,12 +942,13 @@ def _setup_tts_provider(config: dict): "Edge TTS (free, cloud-based, no setup needed)", "ElevenLabs (premium quality, needs API key)", "OpenAI TTS (good quality, needs API key)", + "xAI TTS (Grok voices, needs API key)", "MiniMax TTS (high quality with voice cloning, needs API key)", "Mistral Voxtral TTS (multilingual, native Opus, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1012,6 +1014,23 @@ def _setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "xai": + existing = get_env_value("XAI_API_KEY") + if not existing: + print() + api_key = prompt("xAI API key for TTS", password=True) + if api_key: + save_env_value("XAI_API_KEY", api_key) + print_success("xAI TTS API key saved") + else: + from hermes_constants import display_hermes_home as _dhh + print_warning( + "No xAI API key provided for TTS. Configure XAI_API_KEY via " + f"hermes setup model or {_dhh()}/.env to use xAI TTS. " + "Falling back to Edge TTS." + ) + selected = "edge" + elif selected == "minimax": existing = get_env_value("MINIMAX_API_KEY") if not existing: diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 5fe8cdc79..0609e7ff4 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -146,6 +146,14 @@ TOOL_CATEGORIES = { ], "tts_provider": "openai", }, + { + "name": "xAI TTS", + "tag": "Grok voices - requires xAI API key", + "env_vars": [ + {"key": "XAI_API_KEY", "prompt": "xAI API key", "url": "https://console.x.ai/"}, + ], + "tts_provider": "xai", + }, { "name": "ElevenLabs", "badge": "paid", diff --git a/run_agent.py b/run_agent.py index 2781bf188..cb5dbf4b1 100644 --- a/run_agent.py +++ b/run_agent.py @@ -691,9 +691,14 @@ class AIAgent: self.api_mode = api_mode elif self.provider == "openai-codex": self.api_mode = "codex_responses" + elif self.provider == "xai": + self.api_mode = "codex_responses" elif (provider_name is None) and "chatgpt.com/backend-api/codex" in self._base_url_lower: self.api_mode = "codex_responses" self.provider = "openai-codex" + elif (provider_name is None) and "api.x.ai" in self._base_url_lower: + self.api_mode = "codex_responses" + self.provider = "xai" elif self.provider == "anthropic" or (provider_name is None and "api.anthropic.com" in self._base_url_lower): self.api_mode = "anthropic_messages" self.provider = "anthropic" @@ -4032,6 +4037,7 @@ class AIAgent: "model", "instructions", "input", "tools", "store", "reasoning", "include", "max_output_tokens", "temperature", "tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier", + "extra_headers", } normalized: Dict[str, Any] = { "model": model, @@ -4067,6 +4073,20 @@ class AIAgent: if val is not None: normalized[passthrough_key] = val + extra_headers = api_kwargs.get("extra_headers") + if extra_headers is not None: + if not isinstance(extra_headers, dict): + raise ValueError("Codex Responses request 'extra_headers' must be an object.") + normalized_headers: Dict[str, str] = {} + for key, value in extra_headers.items(): + if not isinstance(key, str) or not key.strip(): + raise ValueError("Codex Responses request 'extra_headers' keys must be non-empty strings.") + if value is None: + continue + normalized_headers[key.strip()] = str(value) + if normalized_headers: + normalized["extra_headers"] = normalized_headers + if allow_stream: stream = api_kwargs.get("stream") if stream is not None and stream is not True: @@ -6504,7 +6524,12 @@ class AIAgent: if not is_github_responses: kwargs["prompt_cache_key"] = self.session_id - if reasoning_enabled: + is_xai_responses = self.provider == "xai" or "api.x.ai" in (self.base_url or "").lower() + + if reasoning_enabled and is_xai_responses: + # xAI reasons automatically — no effort param, just include encrypted content + kwargs["include"] = ["reasoning.encrypted_content"] + elif reasoning_enabled: if is_github_responses: # Copilot's Responses route advertises reasoning-effort support, # but not OpenAI-specific prompt cache or encrypted reasoning @@ -6515,7 +6540,7 @@ class AIAgent: else: kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"} kwargs["include"] = ["reasoning.encrypted_content"] - elif not is_github_responses: + elif not is_github_responses and not is_xai_responses: kwargs["include"] = [] if self.request_overrides: @@ -6524,6 +6549,9 @@ class AIAgent: if self.max_tokens is not None and not is_codex_backend: kwargs["max_output_tokens"] = self.max_tokens + if is_xai_responses and getattr(self, "session_id", None): + kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id} + return kwargs sanitized_messages = api_messages @@ -6706,12 +6734,6 @@ class AIAgent: if extra_body: api_kwargs["extra_body"] = extra_body - # xAI prompt caching: send x-grok-conv-id header to route requests - # to the same server, maximizing automatic cache hits. - # https://docs.x.ai/developers/advanced-api-usage/prompt-caching - if "x.ai" in self._base_url_lower and hasattr(self, "session_id") and self.session_id: - api_kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id} - # Priority Processing / generic request overrides (e.g. service_tier). # Applied last so overrides win over any defaults set above. if self.request_overrides: diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 9fdb63866..65ff725ee 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -45,6 +45,7 @@ from hermes_constants import display_hermes_home logger = logging.getLogger(__name__) from tools.managed_tool_gateway import resolve_managed_tool_gateway from tools.tool_backend_helpers import managed_nous_tools_enabled, resolve_openai_audio_api_key +from tools.xai_http import hermes_xai_user_agent # --------------------------------------------------------------------------- # Lazy imports -- providers are imported only when actually used to avoid @@ -93,6 +94,11 @@ DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady" DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2" DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603" DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral +DEFAULT_XAI_VOICE_ID = "eve" +DEFAULT_XAI_LANGUAGE = "en" +DEFAULT_XAI_SAMPLE_RATE = 24000 +DEFAULT_XAI_BIT_RATE = 128000 +DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" def _get_default_output_dir() -> str: from hermes_constants import get_hermes_dir @@ -299,6 +305,71 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any] close() +# =========================================================================== +# Provider: xAI TTS +# =========================================================================== +def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """ + Generate audio using xAI TTS. + + xAI exposes a dedicated /v1/tts endpoint instead of the OpenAI audio.speech + API shape, so this is implemented as a separate backend. + """ + import requests + + api_key = os.getenv("XAI_API_KEY", "").strip() + if not api_key: + raise ValueError("XAI_API_KEY not set. Get one at https://console.x.ai/") + + xai_config = tts_config.get("xai", {}) + voice_id = str(xai_config.get("voice_id", DEFAULT_XAI_VOICE_ID)).strip() or DEFAULT_XAI_VOICE_ID + language = str(xai_config.get("language", DEFAULT_XAI_LANGUAGE)).strip() or DEFAULT_XAI_LANGUAGE + sample_rate = int(xai_config.get("sample_rate", DEFAULT_XAI_SAMPLE_RATE)) + bit_rate = int(xai_config.get("bit_rate", DEFAULT_XAI_BIT_RATE)) + base_url = str( + xai_config.get("base_url") + or os.getenv("XAI_BASE_URL") + or DEFAULT_XAI_BASE_URL + ).strip().rstrip("/") + + # Match the documented minimal POST /v1/tts shape by default. Only send + # output_format when Hermes actually needs a non-default format/override. + codec = "wav" if output_path.endswith(".wav") else "mp3" + payload: Dict[str, Any] = { + "text": text, + "voice_id": voice_id, + "language": language, + } + if ( + codec != "mp3" + or sample_rate != DEFAULT_XAI_SAMPLE_RATE + or (codec == "mp3" and bit_rate != DEFAULT_XAI_BIT_RATE) + ): + output_format: Dict[str, Any] = {"codec": codec} + if sample_rate: + output_format["sample_rate"] = sample_rate + if codec == "mp3" and bit_rate: + output_format["bit_rate"] = bit_rate + payload["output_format"] = output_format + + response = requests.post( + f"{base_url}/tts", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "User-Agent": hermes_xai_user_agent(), + }, + json=payload, + timeout=60, + ) + response.raise_for_status() + + with open(output_path, "wb") as f: + f.write(response.content) + + return output_path + + # =========================================================================== # Provider: MiniMax TTS # =========================================================================== @@ -600,6 +671,10 @@ def text_to_speech_tool( logger.info("Generating speech with MiniMax TTS...") _generate_minimax_tts(text, file_str, tts_config) + elif provider == "xai": + logger.info("Generating speech with xAI TTS...") + _generate_xai_tts(text, file_str, tts_config) + elif provider == "mistral": try: _import_mistral_client() @@ -661,7 +736,7 @@ def text_to_speech_tool( # Try Opus conversion for Telegram compatibility # Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion voice_compatible = False - if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"): + if provider in ("edge", "neutts", "minimax", "xai") and not file_str.endswith(".ogg"): opus_path = _convert_to_opus(file_str) if opus_path: file_str = opus_path @@ -734,6 +809,8 @@ def check_tts_requirements() -> bool: pass if os.getenv("MINIMAX_API_KEY"): return True + if os.getenv("XAI_API_KEY"): + return True try: _import_mistral_client() if os.getenv("MISTRAL_API_KEY"): diff --git a/tools/xai_http.py b/tools/xai_http.py new file mode 100644 index 000000000..b5bce97c2 --- /dev/null +++ b/tools/xai_http.py @@ -0,0 +1,12 @@ +"""Shared helpers for direct xAI HTTP integrations.""" + +from __future__ import annotations + + +def hermes_xai_user_agent() -> str: + """Return a stable Hermes-specific User-Agent for xAI HTTP calls.""" + try: + from hermes_cli import __version__ + except Exception: + __version__ = "unknown" + return f"Hermes-Agent/{__version__}" diff --git a/toolsets.py b/toolsets.py index 09ee8de09..b725133a6 100644 --- a/toolsets.py +++ b/toolsets.py @@ -151,7 +151,7 @@ TOOLSETS = { }, "tts": { - "description": "Text-to-speech: convert text to audio with Edge TTS (free), ElevenLabs, or OpenAI", + "description": "Text-to-speech: convert text to audio with Edge TTS (free), ElevenLabs, OpenAI, or xAI", "tools": ["text_to_speech"], "includes": [] },