hermes-agent/plugins/video_gen/xai/__init__.py
Jaaneek b62c997973 feat(xai-oauth): add xAI Grok OAuth (SuperGrok Subscription) provider
Adds a new authentication provider that lets SuperGrok subscribers sign
in to Hermes with their xAI account via the standard OAuth 2.0 PKCE
loopback flow, instead of pasting a raw API key from console.x.ai.

Highlights
----------
* OAuth 2.0 PKCE loopback login against accounts.x.ai with discovery,
  state/nonce, and a strict CORS-origin allowlist on the callback.
* Authorize URL carries `plan=generic` (required for non-allowlisted
  loopback clients) and `referrer=hermes-agent` for best-effort
  attribution in xAI's OAuth server logs.
* Token storage in `auth.json` with file-locked atomic writes; JWT
  `exp`-based expiry detection with skew; refresh-token rotation
  synced both ways between the singleton store and the credential
  pool so multi-process / multi-profile setups don't tear each other's
  refresh tokens.
* Reactive 401 retry: on a 401 from the xAI Responses API, the agent
  refreshes the token, swaps it back into `self.api_key`, and retries
  the call once. Guarded against silent account swaps when the active
  key was sourced from a different (manual) pool entry.
* Auxiliary tasks (curator, vision, embeddings, etc.) route through a
  dedicated xAI Responses-mode auxiliary client instead of falling back
  to OpenRouter billing.
* Direct HTTP tools (`tools/xai_http.py`, transcription, TTS, image-gen
  plugin) resolve credentials through a unified runtime → singleton →
  env-var fallback chain so xai-oauth users get them for free.
* `hermes auth add xai-oauth` and `hermes auth remove xai-oauth N` are
  wired through the standard auth-commands surface; remove cleans up
  the singleton loopback_pkce entry so it doesn't silently reinstate.
* `hermes model` provider picker shows
  "xAI Grok OAuth (SuperGrok Subscription)" and the model-flow falls
  back to pool credentials when the singleton is missing.

Hardening
---------
* Discovery and refresh responses validate the returned
  `token_endpoint` host against the same `*.x.ai` allowlist as the
  authorization endpoint, blocking MITM persistence of a hostile
  endpoint.
* Discovery / refresh / token-exchange `response.json()` calls are
  wrapped to raise typed `AuthError` on malformed bodies (captive
  portals, proxy error pages) instead of leaking JSONDecodeError
  tracebacks.
* `prompt_cache_key` is routed through `extra_body` on the codex
  transport (sending it as a top-level kwarg trips xAI's SDK with a
  TypeError).
* Credential-pool sync-back preserves `active_provider` so refreshing
  an OAuth entry doesn't silently flip the active provider out from
  under the running agent.

Testing
-------
* New `tests/hermes_cli/test_auth_xai_oauth_provider.py` (~63 tests)
  covers JWT expiry, OAuth URL params (plan + referrer), CORS origins,
  redirect URI validation, singleton↔pool sync, concurrency races,
  refresh error paths, runtime resolution, and malformed-JSON guards.
* Extended `test_credential_pool.py`, `test_codex_transport.py`, and
  `test_run_agent_codex_responses.py` cover the pool sync-back,
  `extra_body` routing, and 401 reactive refresh paths.
* 165 tests passing on this branch via `scripts/run_tests.sh`.
2026-05-15 12:11:32 -07:00

441 lines
15 KiB
Python

"""xAI Grok-Imagine video generation backend.
Surface: text-to-video and image-to-video (animate an input image)
through xAI's ``/videos/generations`` endpoint. Edit and extend are not
exposed in this unified surface — xAI is the only backend that supports
them and the inconsistency would force per-backend prose in the agent's
tool description.
Originally salvaged from PR #10600 by @Jaaneek; reshaped into the
:class:`VideoGenProvider` plugin interface and trimmed to the
generate-only surface.
Authentication: xAI Grok OAuth tokens (preferred — billed against the
user's SuperGrok subscription) or ``XAI_API_KEY``. Both routes are
resolved through ``tools.xai_http.resolve_xai_http_credentials`` so a
single login covers chat + TTS + image gen + video gen + transcription.
Output is an HTTPS URL from xAI's CDN; the gateway downloads and
delivers it.
"""
from __future__ import annotations
import asyncio
import logging
import os
import uuid
from typing import Any, Dict, List, Optional, Tuple
import httpx
from agent.video_gen_provider import (
VideoGenProvider,
error_response,
success_response,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_MODEL = "grok-imagine-video"
DEFAULT_DURATION = 8
DEFAULT_ASPECT_RATIO = "16:9"
DEFAULT_RESOLUTION = "720p"
DEFAULT_TIMEOUT_SECONDS = 240
DEFAULT_POLL_INTERVAL_SECONDS = 5
VALID_ASPECT_RATIOS = {"1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3"}
VALID_RESOLUTIONS = {"480p", "720p"}
MAX_REFERENCE_IMAGES = 7
_MODELS: Dict[str, Dict[str, Any]] = {
"grok-imagine-video": {
"display": "Grok Imagine Video",
"speed": "~60-240s",
"strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.",
"price": "see https://docs.x.ai/docs/models",
"modalities": ["text", "image"],
},
}
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def _resolve_xai_credentials() -> Tuple[str, str]:
"""Return ``(api_key, base_url)`` from the shared xAI credential resolver.
Order: runtime provider (xai-oauth pool entry) → singleton ``auth.json``
OAuth tokens → ``XAI_API_KEY`` env var. ``api_key`` is empty when no
credential source is available; callers must check before using it.
"""
try:
from tools.xai_http import resolve_xai_http_credentials
creds = resolve_xai_http_credentials() or {}
except Exception as exc:
logger.debug("xAI credential resolver failed: %s", exc)
creds = {}
api_key = str(creds.get("api_key") or os.getenv("XAI_API_KEY", "")).strip()
base_url = str(
creds.get("base_url")
or os.getenv("XAI_BASE_URL")
or DEFAULT_XAI_BASE_URL
).strip().rstrip("/")
return api_key, base_url
def _xai_user_agent() -> str:
try:
from tools.xai_http import hermes_xai_user_agent
return hermes_xai_user_agent()
except Exception:
return "hermes-agent/video_gen"
def _xai_headers(api_key: str) -> Dict[str, str]:
return {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"User-Agent": _xai_user_agent(),
}
def _normalize_reference_images(reference_image_urls: Optional[List[str]]):
refs = []
for url in reference_image_urls or []:
normalized = (url or "").strip()
if normalized:
refs.append({"url": normalized})
return refs or None
def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
value = duration if duration is not None else DEFAULT_DURATION
if value < 1:
value = 1
if value > 15:
value = 15
if has_reference_images and value > 10:
value = 10
return value
async def _submit(
client: httpx.AsyncClient,
payload: Dict[str, Any],
*,
api_key: str,
base_url: str,
) -> str:
"""POST to /videos/generations — xAI's only public endpoint for our
text-to-video and image-to-video surface."""
response = await client.post(
f"{base_url}/videos/generations",
headers={**_xai_headers(api_key), "x-idempotency-key": str(uuid.uuid4())},
json=payload,
timeout=60,
)
response.raise_for_status()
body = response.json()
request_id = body.get("request_id")
if not request_id:
raise RuntimeError("xAI video response did not include request_id")
return request_id
async def _poll(
client: httpx.AsyncClient,
request_id: str,
*,
api_key: str,
base_url: str,
timeout_seconds: int,
poll_interval: int,
) -> Dict[str, Any]:
elapsed = 0.0
last_status = "queued"
while elapsed < timeout_seconds:
response = await client.get(
f"{base_url}/videos/{request_id}",
headers=_xai_headers(api_key),
timeout=30,
)
response.raise_for_status()
body = response.json()
last_status = (body.get("status") or "").lower()
if last_status == "done":
return {"status": "done", "body": body}
if last_status in {"failed", "error", "expired", "cancelled"}:
return {"status": last_status, "body": body}
await asyncio.sleep(poll_interval)
elapsed += poll_interval
return {"status": "timeout", "body": {"status": last_status}}
# ---------------------------------------------------------------------------
# Provider
# ---------------------------------------------------------------------------
class XAIVideoGenProvider(VideoGenProvider):
"""xAI grok-imagine-video backend (text-to-video + image-to-video)."""
@property
def name(self) -> str:
return "xai"
@property
def display_name(self) -> str:
return "xAI"
def is_available(self) -> bool:
api_key, _ = _resolve_xai_credentials()
return bool(api_key)
def list_models(self) -> List[Dict[str, Any]]:
return [{"id": mid, **meta} for mid, meta in _MODELS.items()]
def default_model(self) -> Optional[str]:
return DEFAULT_MODEL
def get_setup_schema(self) -> Dict[str, Any]:
# Auth resolution lives entirely in the shared ``xai_grok`` post_setup
# hook (``hermes_cli/tools_config.py``) so the picker doesn't blindly
# prompt for an API key when the user is already signed in via xAI
# Grok OAuth (SuperGrok Subscription) — TTS / image gen / video gen
# all share the same credential resolver. The hook offers an
# OAuth-vs-API-key choice when neither is configured.
return {
"name": "xAI Grok Imagine",
"badge": "paid",
"tag": "grok-imagine-video — text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
"env_vars": [],
"post_setup": "xai_grok",
}
def capabilities(self) -> Dict[str, Any]:
return {
"modalities": ["text", "image"],
"aspect_ratios": sorted(VALID_ASPECT_RATIOS),
"resolutions": sorted(VALID_RESOLUTIONS),
"max_duration": 15,
"min_duration": 1,
"supports_audio": False,
"supports_negative_prompt": False,
"max_reference_images": MAX_REFERENCE_IMAGES,
}
def generate(
self,
prompt: str,
*,
model: Optional[str] = None,
image_url: Optional[str] = None,
reference_image_urls: Optional[List[str]] = None,
duration: Optional[int] = None,
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
resolution: str = DEFAULT_RESOLUTION,
negative_prompt: Optional[str] = None,
audio: Optional[bool] = None,
seed: Optional[int] = None,
**kwargs: Any,
) -> Dict[str, Any]:
try:
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(self._generate_async(
prompt=prompt,
model=model,
image_url=image_url,
reference_image_urls=reference_image_urls,
duration=duration,
aspect_ratio=aspect_ratio,
resolution=resolution,
))
finally:
loop.close()
except Exception as exc:
logger.warning("xAI video gen unexpected failure: %s", exc, exc_info=True)
return error_response(
error=f"xAI video generation failed: {exc}",
error_type="api_error",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
aspect_ratio=aspect_ratio,
)
async def _generate_async(
self,
*,
prompt: str,
model: Optional[str],
image_url: Optional[str],
reference_image_urls: Optional[List[str]],
duration: Optional[int],
aspect_ratio: str,
resolution: str,
) -> Dict[str, Any]:
api_key, base_url = _resolve_xai_credentials()
if not api_key:
return error_response(
error=(
"No xAI credentials found. Sign in via `hermes auth add xai-oauth` "
"(SuperGrok subscription) or set XAI_API_KEY from "
"https://console.x.ai/."
),
error_type="auth_required",
provider="xai", prompt=prompt,
)
prompt = (prompt or "").strip()
image_url_norm = (image_url or "").strip() or None
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
modality_used = "image" if image_url_norm else "text"
if not prompt:
return error_response(
error=(
"prompt is required for xAI video generation "
"(text-to-video or image-to-video)"
),
error_type="missing_prompt",
provider="xai", prompt=prompt,
)
refs = _normalize_reference_images(reference_image_urls)
if refs and len(refs) > MAX_REFERENCE_IMAGES:
return error_response(
error=f"reference_image_urls supports at most {MAX_REFERENCE_IMAGES} images on xAI",
error_type="too_many_references",
provider="xai", prompt=prompt,
)
if image_url_norm and refs:
return error_response(
error="image_url and reference_image_urls cannot be combined on xAI",
error_type="conflicting_inputs",
provider="xai", prompt=prompt,
)
clamped_duration = _clamp_duration(duration, has_reference_images=bool(refs))
if normalized_aspect_ratio not in VALID_ASPECT_RATIOS:
normalized_aspect_ratio = DEFAULT_ASPECT_RATIO
if normalized_resolution not in VALID_RESOLUTIONS:
normalized_resolution = DEFAULT_RESOLUTION
payload: Dict[str, Any] = {
"model": model or DEFAULT_MODEL,
"prompt": prompt,
"duration": clamped_duration,
"aspect_ratio": normalized_aspect_ratio,
"resolution": normalized_resolution,
}
if image_url_norm:
payload["image"] = {"url": image_url_norm}
if refs:
payload["reference_images"] = refs
async with httpx.AsyncClient() as client:
try:
request_id = await _submit(
client, payload, api_key=api_key, base_url=base_url
)
except httpx.HTTPStatusError as exc:
detail = ""
try:
detail = exc.response.text[:500]
except Exception:
pass
return error_response(
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
error_type="api_error",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
)
poll_result = await _poll(
client, request_id,
api_key=api_key, base_url=base_url,
timeout_seconds=DEFAULT_TIMEOUT_SECONDS,
poll_interval=DEFAULT_POLL_INTERVAL_SECONDS,
)
status = poll_result["status"]
body = poll_result["body"]
if status == "done":
video = body.get("video") or {}
url = video.get("url")
if not url:
return error_response(
error="xAI video generation completed without a video URL",
error_type="empty_response",
provider="xai",
model=body.get("model") or model or DEFAULT_MODEL,
prompt=prompt,
)
extra: Dict[str, Any] = {
"request_id": request_id,
"resolution": normalized_resolution,
}
if body.get("usage"):
extra["usage"] = body["usage"]
return success_response(
video=url,
model=body.get("model") or model or DEFAULT_MODEL,
prompt=prompt,
modality=modality_used,
aspect_ratio=normalized_aspect_ratio,
duration=video.get("duration") or clamped_duration,
provider="xai",
extra=extra,
)
if status == "timeout":
return error_response(
error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
error_type="timeout",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
)
message = (
(body.get("error", {}) or {}).get("message")
or body.get("message")
or f"xAI video generation ended with status '{status}'"
)
return error_response(
error=message,
error_type=f"xai_{status}",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
)
# ---------------------------------------------------------------------------
# Plugin entry point
# ---------------------------------------------------------------------------
def register(ctx) -> None:
"""Plugin entry point — wire ``XAIVideoGenProvider`` into the registry."""
ctx.register_video_gen_provider(XAIVideoGenProvider())