feat(xai): Imagine public-URL storage, chaining & video edit/extend

Add durable public-URL output and URL-based chaining to xAI Grok Imagine:

- Store generated media on files-cdn with permanent public HTTPS URLs
  (public_url: true, no expiry by default).
- Chain by URL: generate -> edit -> extend each take a prior result's
  public HTTPS URL (or a data URI / local file for inputs).
- Add provider-specific xai_video_edit and xai_video_extend tools.
- Image generation: public-URL/storage output, multi-reference edits,
  and ~/ local-path support for image edits.

Credentials use xAI Grok device-code OAuth (separate PR).
This commit is contained in:
Jaaneek 2026-06-29 18:18:30 +00:00 committed by Teknium
parent 184c10cf97
commit 9ce79cd642
15 changed files with 1694 additions and 294 deletions

View file

@ -19,6 +19,7 @@ from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import requests
@ -33,7 +34,14 @@ from agent.image_gen_provider import (
save_url_image,
success_response,
)
from tools.xai_http import hermes_xai_user_agent, resolve_xai_http_credentials
from tools.xai_http import (
build_xai_storage_options,
hermes_xai_user_agent,
maybe_mark_xai_storage_notice_seen,
read_xai_imagine_storage_config,
resolve_xai_http_credentials,
xai_storage_notice_text,
)
logger = logging.getLogger(__name__)
@ -118,10 +126,8 @@ def _resolve_resolution() -> str:
def _xai_image_field(source: str) -> Dict[str, str]:
"""Build the xAI ``image`` field for an edit request.
xAI's ``/v1/images/edits`` accepts ``{"url": <ref>, "type": "image_url"}``
where ``<ref>`` is a public URL or a base64 data URI. Public URLs and
existing data URIs pass through unchanged; local file paths are read and
encoded into a ``data:`` URI.
xAI's ``/v1/images/edits`` accepts a public HTTPS URL or a base64 data URI.
Local file paths are read and encoded into a ``data:`` URI.
"""
source = source.strip()
lower = source.lower()
@ -131,7 +137,7 @@ def _xai_image_field(source: str) -> Dict[str, str]:
import base64
import os as _os
with open(source, "rb") as fh:
with open(_os.path.expanduser(source), "rb") as fh:
raw = fh.read()
ext = (_os.path.splitext(source)[1].lstrip(".") or "png").lower()
if ext == "jpg":
@ -176,19 +182,29 @@ class XAIImageGenProvider(ImageGenProvider):
# hook (``hermes_cli/tools_config.py``); identical to the TTS / video
# gen entries so users see the same OAuth-or-API-key choice for every
# xAI service.
storage_notice = xai_storage_notice_text("image_gen")
tag = (
"grok-imagine-image - text-to-image & image editing; uses xAI "
"Grok OAuth or XAI_API_KEY"
)
if storage_notice:
tag += f". {storage_notice}"
return {
"name": "xAI Grok Imagine (image)",
"badge": "paid",
"tag": "grok-imagine-image — text-to-image & image editing; uses xAI Grok OAuth or XAI_API_KEY",
"tag": tag,
"env_vars": [],
"post_setup": "xai_grok",
}
def capabilities(self) -> Dict[str, Any]:
# xAI's /v1/images/edits supports image editing via grok-imagine-image
# -quality. Single primary source image (multi-image editing exists as
# a separate capability but we keep the primary edit surface here).
return {"modalities": ["text", "image"], "max_reference_images": 1}
# -quality, including up to 3 total source images.
return {
"modalities": ["text", "image"],
"max_reference_images": 2,
"max_source_images": 3,
}
def generate(
self,
@ -224,16 +240,39 @@ class XAIImageGenProvider(ImageGenProvider):
resolution = _resolve_resolution()
xai_res = resolution if resolution in _XAI_RESOLUTIONS else DEFAULT_RESOLUTION
# Pick the primary source image: explicit image_url wins, else the
# first reference image.
source_image = None
source_images: List[str] = []
if isinstance(image_url, str) and image_url.strip():
source_image = image_url.strip()
else:
refs = normalize_reference_images(reference_image_urls)
if refs:
source_image = refs[0]
is_edit = bool(source_image)
source_images.append(image_url.strip())
refs = normalize_reference_images(reference_image_urls)
if refs:
source_images.extend(refs)
if len(source_images) > 3:
return error_response(
error="xAI image editing supports at most 3 source images",
error_type="too_many_references",
provider=provider_name,
model="grok-imagine-image-quality",
prompt=prompt,
aspect_ratio=aspect,
)
for index, source in enumerate(source_images):
field = "image_url" if index == 0 and image_url and image_url.strip() == source else "reference_image_urls"
lower = source.lower()
if not lower.startswith(("http://", "https://", "data:")):
path = Path(source).expanduser()
if not path.is_file():
return error_response(
error=(
f"{field} must be a public HTTPS URL or data URI "
"(e.g. the `image`/`public_url` from a prior Imagine result)"
),
error_type="invalid_image_url",
provider=provider_name,
model="grok-imagine-image-quality",
prompt=prompt,
aspect_ratio=aspect,
)
is_edit = bool(source_images)
modality = "image" if is_edit else "text"
headers = {
@ -243,6 +282,13 @@ class XAIImageGenProvider(ImageGenProvider):
}
base_url = str(creds.get("base_url") or "https://api.x.ai/v1").strip().rstrip("/")
storage_options = build_xai_storage_options(
"image_gen",
filename_prefix="hermes-xai-image",
extension="png",
)
storage_notice = maybe_mark_xai_storage_notice_seen("image_gen")
storage_cfg = read_xai_imagine_storage_config("image_gen")
if is_edit:
# Editing requires the quality model per xAI docs. The source
@ -250,7 +296,7 @@ class XAIImageGenProvider(ImageGenProvider):
# are converted to a data URI here.
edit_model = "grok-imagine-image-quality"
try:
image_field = _xai_image_field(source_image)
image_fields = [_xai_image_field(source) for source in source_images]
except Exception as exc:
return error_response(
error=f"Could not load source image for editing: {exc}",
@ -263,8 +309,11 @@ class XAIImageGenProvider(ImageGenProvider):
payload: Dict[str, Any] = {
"model": edit_model,
"prompt": prompt,
"image": image_field,
}
if len(image_fields) == 1:
payload["image"] = image_fields[0]
else:
payload["images"] = image_fields
endpoint_url = f"{base_url}/images/edits"
model_id = edit_model
else:
@ -275,6 +324,8 @@ class XAIImageGenProvider(ImageGenProvider):
"resolution": xai_res,
}
endpoint_url = f"{base_url}/images/generations"
if storage_options is not None:
payload["storage_options"] = storage_options
try:
response = requests.post(
@ -331,7 +382,8 @@ class XAIImageGenProvider(ImageGenProvider):
aspect_ratio=aspect,
)
# Parse response — xAI returns data[0].b64_json or data[0].url
# Parse response - xAI returns data[0].b64_json, data[0].url, and
# optionally data[0].file_output when storage_options were requested.
data = result.get("data", [])
if not data:
return error_response(
@ -346,8 +398,13 @@ class XAIImageGenProvider(ImageGenProvider):
first = data[0]
b64 = first.get("b64_json")
url = first.get("url")
file_output = first.get("file_output") if isinstance(first, dict) else None
file_output = file_output if isinstance(file_output, dict) else {}
public_url = file_output.get("public_url") if isinstance(file_output.get("public_url"), str) else None
if b64:
if public_url:
image_ref = public_url
elif b64:
try:
saved_path = save_b64_image(b64, prefix=f"xai_{model_id}")
except Exception as exc:
@ -389,9 +446,27 @@ class XAIImageGenProvider(ImageGenProvider):
aspect_ratio=aspect,
)
extra: Dict[str, Any] = {}
extra: Dict[str, Any] = {
"storage_enabled": bool(storage_cfg["enabled"]),
}
if not is_edit:
extra["resolution"] = xai_res
if storage_notice:
extra["storage_notice"] = storage_notice
if public_url:
extra["public_url"] = public_url
if file_output:
for key in (
"filename",
"expires_at",
"public_url_expires_at",
"public_url_error",
"storage_error",
):
if key in file_output:
extra[key] = file_output[key]
if result.get("usage"):
extra["usage"] = result["usage"]
return success_response(
image=image_ref,

View file

@ -1,10 +1,7 @@
"""xAI Grok-Imagine video generation backend.
Surface: text-to-video and image-to-video (animate an input image)
through xAI's ``/videos/generations`` endpoint. Edit and extend are not
exposed in this unified surface xAI is the only backend that supports
them and the inconsistency would force per-backend prose in the agent's
tool description.
Surface: text-to-video, image-to-video, and reference-to-video through the
unified video provider. xAI edit/extend are exposed through separate tools.
Originally salvaged from PR #10600 by @Jaaneek; reshaped into the
:class:`VideoGenProvider` plugin interface and trimmed to the
@ -14,8 +11,9 @@ Authentication: xAI Grok OAuth tokens (preferred — billed against the
user's SuperGrok or X Premium+ subscription) or ``XAI_API_KEY``. Both routes are
resolved through ``tools.xai_http.resolve_xai_http_credentials`` so a
single login covers chat + TTS + image gen + video gen + transcription.
Output is an HTTPS URL from xAI's CDN; the gateway downloads and
delivers it.
When xAI storage is enabled, the primary ``video`` / ``public_url`` fields are the
stored files-cdn HTTPS link. Pass that public MP4 URL as ``video_url`` for
edit/extend; it is sent to xAI as ``video.url``.
"""
from __future__ import annotations
@ -46,13 +44,14 @@ logger = logging.getLogger(__name__)
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video"
DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview"
DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5"
DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL
DEFAULT_DURATION = 8
DEFAULT_ASPECT_RATIO = "16:9"
DEFAULT_RESOLUTION = "720p"
DEFAULT_TIMEOUT_SECONDS = 240
DEFAULT_POLL_INTERVAL_SECONDS = 5
DEFAULT_EXTEND_DURATION = 6
VALID_ASPECT_RATIOS = {"1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3"}
VALID_RESOLUTIONS = {"480p", "720p"}
@ -67,16 +66,20 @@ _MODELS: Dict[str, Dict[str, Any]] = {
"price": "see https://docs.x.ai/developers/models/grok-imagine-video",
"modalities": ["text", "image"],
},
"grok-imagine-video-1.5-preview": {
"display": "Grok Imagine Video 1.5 Preview",
"grok-imagine-video-1.5": {
"display": "Grok Imagine Video 1.5",
"speed": "~60-240s",
"strengths": "Latest xAI image-to-video model.",
"price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview",
"price": "see https://docs.x.ai/developers/pricing",
"modalities": ["image"],
"aliases": ["grok-imagine-video-1.5-2026-05-30"],
},
}
_IMAGE_TO_VIDEO_COMPAT_MODEL_IDS = {
"grok-imagine-video-1.5-preview",
"grok-imagine-video-1.5-2026-05-30",
}
# ---------------------------------------------------------------------------
# HTTP helpers
@ -145,21 +148,114 @@ def _image_ref_to_xai_url(value: str) -> str:
return f"data:{mime};base64,{encoded}"
def _normalize_reference_images(reference_image_urls: Optional[List[str]]):
refs = []
def _image_ref_to_xai_input(value: str) -> Optional[Dict[str, str]]:
ref = _image_ref_to_xai_url(value)
if not ref:
return None
lower = ref.lower()
if lower.startswith(("http://", "https://", "data:image/")):
return {"url": ref}
return None
def _xai_video_output_urls(
video: Dict[str, Any],
) -> Tuple[str, Optional[str], Optional[str]]:
"""Return ``(public_video_url, temporary_url, stored_public_url)``.
``public_video_url`` is the stored files-cdn HTTPS MP4 (``public_url``) when
storage is enabled; otherwise xAI's temporary ``video.url``. Pass this value
as ``video_url`` for edit/extend chaining.
"""
file_output = video.get("file_output") if isinstance(video.get("file_output"), dict) else {}
file_output = file_output or {}
stored_public = file_output.get("public_url")
stored_public = stored_public.strip() if isinstance(stored_public, str) else None
temporary = video.get("url")
temporary = temporary.strip() if isinstance(temporary, str) else None
public_video_url = stored_public or temporary or ""
temporary_out = (
temporary
if temporary and stored_public and temporary != stored_public
else None
)
return public_video_url, temporary_out, stored_public
def _video_ref_to_xai_url(value: str) -> str:
"""Return a URL/data URI accepted by xAI for video inputs."""
ref = (value or "").strip()
if not ref:
return ""
lower = ref.lower()
if lower.startswith(("http://", "https://", "data:video/")):
return ref
path = Path(ref).expanduser()
if not path.is_file():
return ref
mime = mimetypes.guess_type(path.name)[0] or "video/mp4"
if not mime.startswith("video/"):
return ref
encoded = base64.b64encode(path.read_bytes()).decode("ascii")
return f"data:{mime};base64,{encoded}"
async def _video_input_from_public_url(
value: str,
*,
api_key: str,
base_url: str,
) -> Optional[Dict[str, str]]:
"""Build xAI ``video`` input using a public HTTPS URL (``url`` field only)."""
ref = (value or "").strip()
if not ref:
return None
path = Path(ref).expanduser()
if path.is_file():
data_ref = _video_ref_to_xai_url(ref)
return {"url": data_ref} if data_ref else None
lower = ref.lower()
if not lower.startswith(("http://", "https://")):
return None
return {"url": ref}
def _normalize_reference_images(
reference_image_urls: Optional[List[str]],
) -> Tuple[Optional[List[Dict[str, str]]], Optional[str]]:
refs: List[Dict[str, str]] = []
for url in reference_image_urls or []:
normalized = _image_ref_to_xai_url(url)
if normalized:
refs.append({"url": normalized})
return refs or None
cleaned = (url or "").strip()
if not cleaned:
continue
normalized = _image_ref_to_xai_input(cleaned)
if not normalized:
return None, (
"reference_image_urls must be public HTTPS URLs or data URIs "
"(e.g. the `image`/`public_url` from a prior Imagine result)"
)
refs.append(normalized)
return (refs if refs else None), None
def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
value = duration if duration is not None else DEFAULT_DURATION
def _clamp_duration(
duration: Optional[int],
*,
has_reference_images: bool = False,
max_seconds: int = 15,
default: int = DEFAULT_DURATION,
) -> int:
value = duration if duration is not None else default
if value < 1:
value = 1
if value > 15:
value = 15
if value > max_seconds:
value = max_seconds
if has_reference_images and value > 10:
value = 10
return value
@ -173,7 +269,7 @@ def _resolve_model_for_modality(
) -> str:
"""Select xAI's text/video model without treating config as a prompt override.
``grok-imagine-video-1.5-preview`` currently rejects text-only video
``grok-imagine-video-1.5`` currently rejects text-only video
generation, but it is the desired image-to-video backend. Explicit tool
``model=`` still wins for users who intentionally request another model.
"""
@ -182,7 +278,7 @@ def _resolve_model_for_modality(
return requested
if modality == "image":
return DEFAULT_IMAGE_TO_VIDEO_MODEL
if requested == DEFAULT_IMAGE_TO_VIDEO_MODEL:
if requested == DEFAULT_IMAGE_TO_VIDEO_MODEL or requested in _IMAGE_TO_VIDEO_COMPAT_MODEL_IDS:
return DEFAULT_TEXT_TO_VIDEO_MODEL
return requested or DEFAULT_TEXT_TO_VIDEO_MODEL
@ -193,11 +289,11 @@ async def _submit(
*,
api_key: str,
base_url: str,
endpoint: str = "generations",
) -> str:
"""POST to /videos/generations — xAI's only public endpoint for our
text-to-video and image-to-video surface."""
"""POST to one of xAI's async video endpoints and return request_id."""
response = await client.post(
f"{base_url}/videos/generations",
f"{base_url}/videos/{endpoint}",
headers={**_xai_headers(api_key), "x-idempotency-key": str(uuid.uuid4())},
json=payload,
timeout=60,
@ -248,7 +344,7 @@ async def _poll(
class XAIVideoGenProvider(VideoGenProvider):
"""xAI Grok Imagine video backend (text-to-video + image-to-video)."""
"""xAI Grok Imagine video backend."""
@property
def name(self) -> str:
@ -275,10 +371,25 @@ class XAIVideoGenProvider(VideoGenProvider):
# Grok OAuth (SuperGrok / Premium+) — TTS / image gen / video gen
# all share the same credential resolver. The hook offers an
# OAuth-vs-API-key choice when neither is configured.
try:
from tools.xai_http import xai_storage_notice_text
storage_notice = xai_storage_notice_text("video_gen")
except Exception:
storage_notice = ""
tag = (
"grok-imagine-video for text/reference; "
"grok-imagine-video-1.5 for image-to-video; "
"edit/extend: pass the stored public HTTPS MP4 (`video` / "
"`public_url` from a prior Imagine result); uses xAI Grok OAuth "
"or XAI_API_KEY"
)
if storage_notice:
tag += f". {storage_notice}"
return {
"name": "xAI Grok Imagine",
"badge": "paid",
"tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
"tag": tag,
"env_vars": [],
"post_setup": "xai_grok",
}
@ -310,189 +421,479 @@ class XAIVideoGenProvider(VideoGenProvider):
seed: Optional[int] = None,
**kwargs: Any,
) -> Dict[str, Any]:
try:
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(self._generate_async(
prompt=prompt,
model=model,
explicit_model=bool(kwargs.get("_model_override_explicit")),
image_url=image_url,
reference_image_urls=reference_image_urls,
duration=duration,
aspect_ratio=aspect_ratio,
resolution=resolution,
))
finally:
loop.close()
except Exception as exc:
logger.warning("xAI video gen unexpected failure: %s", exc, exc_info=True)
return error_response(
error=f"xAI video generation failed: {exc}",
error_type="api_error",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
aspect_ratio=aspect_ratio,
)
async def _generate_async(
self,
*,
prompt: str,
model: Optional[str],
explicit_model: bool,
image_url: Optional[str],
reference_image_urls: Optional[List[str]],
duration: Optional[int],
aspect_ratio: str,
resolution: str,
) -> Dict[str, Any]:
api_key, base_url = _resolve_xai_credentials()
if not api_key:
return error_response(
error=(
"No xAI credentials found. Sign in via `hermes auth add xai-oauth` "
"(SuperGrok / Premium+) or set XAI_API_KEY from "
"https://console.x.ai/."
),
error_type="auth_required",
provider="xai", prompt=prompt,
)
prompt = (prompt or "").strip()
image_url_norm = _image_ref_to_xai_url(image_url or "") or None
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
modality_used = "image" if image_url_norm else "text"
resolved_model = _resolve_model_for_modality(
model,
modality=modality_used,
explicit_model=explicit_model,
return run_xai_video_generation(
prompt=prompt,
model=model,
explicit_model=bool(kwargs.get("_model_override_explicit")),
image_url=image_url,
reference_image_urls=reference_image_urls,
duration=duration,
aspect_ratio=aspect_ratio,
resolution=resolution,
)
if not prompt:
def has_xai_video_credentials() -> bool:
api_key, _ = _resolve_xai_credentials()
return bool(api_key)
def run_xai_video_generation(
*,
prompt: str,
model: Optional[str],
explicit_model: bool,
image_url: Optional[str],
reference_image_urls: Optional[List[str]],
duration: Optional[int],
aspect_ratio: str,
resolution: str,
) -> Dict[str, Any]:
return _run_xai_video_coroutine(
_generate_xai_video_async(
prompt=prompt,
model=model,
explicit_model=explicit_model,
image_url=image_url,
reference_image_urls=reference_image_urls,
duration=duration,
aspect_ratio=aspect_ratio,
resolution=resolution,
),
operation_label="generation",
model=model,
prompt=prompt,
aspect_ratio=aspect_ratio,
)
def run_xai_video_edit(
*,
prompt: str,
video_url: str,
model: Optional[str] = None,
) -> Dict[str, Any]:
return _run_xai_video_coroutine(
_edit_xai_video_async(prompt=prompt, video_url=video_url, model=model),
operation_label="edit",
model=model,
prompt=prompt,
aspect_ratio=DEFAULT_ASPECT_RATIO,
)
def run_xai_video_extend(
*,
prompt: str,
video_url: str,
duration: Optional[int] = None,
model: Optional[str] = None,
) -> Dict[str, Any]:
return _run_xai_video_coroutine(
_extend_xai_video_async(
prompt=prompt,
video_url=video_url,
duration=duration,
model=model,
),
operation_label="extend",
model=model,
prompt=prompt,
aspect_ratio=DEFAULT_ASPECT_RATIO,
)
def _run_xai_video_coroutine(
coro,
*,
operation_label: str,
model: Optional[str],
prompt: str,
aspect_ratio: str,
) -> Dict[str, Any]:
try:
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
finally:
loop.close()
except Exception as exc:
logger.warning("xAI video %s unexpected failure: %s", operation_label, exc, exc_info=True)
return error_response(
error=f"xAI video {operation_label} failed: {exc}",
error_type="api_error",
provider="xai",
model=model or DEFAULT_MODEL,
prompt=prompt,
aspect_ratio=aspect_ratio,
)
async def _generate_xai_video_async(
*,
prompt: str,
model: Optional[str],
explicit_model: bool,
image_url: Optional[str],
reference_image_urls: Optional[List[str]],
duration: Optional[int],
aspect_ratio: str,
resolution: str,
) -> Dict[str, Any]:
api_key, base_url = _resolve_xai_credentials()
if not api_key:
return _auth_required_response(prompt)
prompt = (prompt or "").strip()
image_input = None
if (image_url or "").strip():
image_input = _image_ref_to_xai_input(image_url)
if not image_input:
return error_response(
error=(
"prompt is required for xAI video generation "
"(text-to-video or image-to-video)"
"image_url must be a public HTTPS URL or data URI "
"(e.g. the `image`/`public_url` from a prior Imagine result)"
),
error_type="missing_prompt",
provider="xai", prompt=prompt,
)
refs = _normalize_reference_images(reference_image_urls)
if refs and len(refs) > MAX_REFERENCE_IMAGES:
return error_response(
error=f"reference_image_urls supports at most {MAX_REFERENCE_IMAGES} images on xAI",
error_type="too_many_references",
provider="xai", prompt=prompt,
)
if image_url_norm and refs:
return error_response(
error="image_url and reference_image_urls cannot be combined on xAI",
error_type="conflicting_inputs",
provider="xai", prompt=prompt,
)
clamped_duration = _clamp_duration(duration, has_reference_images=bool(refs))
if normalized_aspect_ratio not in VALID_ASPECT_RATIOS:
normalized_aspect_ratio = DEFAULT_ASPECT_RATIO
if normalized_resolution not in VALID_RESOLUTIONS:
normalized_resolution = DEFAULT_RESOLUTION
payload: Dict[str, Any] = {
"model": resolved_model,
"prompt": prompt,
"duration": clamped_duration,
"aspect_ratio": normalized_aspect_ratio,
"resolution": normalized_resolution,
}
if image_url_norm:
payload["image"] = {"url": image_url_norm}
if refs:
payload["reference_images"] = refs
async with httpx.AsyncClient() as client:
try:
request_id = await _submit(
client, payload, api_key=api_key, base_url=base_url
)
except httpx.HTTPStatusError as exc:
detail = ""
try:
detail = exc.response.text[:500]
except Exception:
pass
return error_response(
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
error_type="api_error",
provider="xai",
model=resolved_model,
prompt=prompt,
)
poll_result = await _poll(
client, request_id,
api_key=api_key, base_url=base_url,
timeout_seconds=DEFAULT_TIMEOUT_SECONDS,
poll_interval=DEFAULT_POLL_INTERVAL_SECONDS,
)
status = poll_result["status"]
body = poll_result["body"]
if status == "done":
video = body.get("video") or {}
url = video.get("url")
if not url:
return error_response(
error="xAI video generation completed without a video URL",
error_type="empty_response",
provider="xai",
model=body.get("model") or resolved_model,
prompt=prompt,
)
extra: Dict[str, Any] = {
"request_id": request_id,
"resolution": normalized_resolution,
}
if body.get("usage"):
extra["usage"] = body["usage"]
return success_response(
video=url,
model=body.get("model") or resolved_model,
prompt=prompt,
modality=modality_used,
aspect_ratio=normalized_aspect_ratio,
duration=video.get("duration") or clamped_duration,
error_type="invalid_image_url",
provider="xai",
extra=extra,
prompt=prompt,
)
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
refs, refs_error = _normalize_reference_images(reference_image_urls)
if refs_error:
return error_response(
error=refs_error,
error_type="invalid_reference_image_urls",
provider="xai",
prompt=prompt,
)
if status == "timeout":
if not prompt:
return error_response(
error="prompt is required for xAI video generation",
error_type="missing_prompt",
provider="xai", prompt=prompt,
)
if refs and len(refs) > MAX_REFERENCE_IMAGES:
return error_response(
error=f"reference_image_urls supports at most {MAX_REFERENCE_IMAGES} images on xAI",
error_type="too_many_references",
provider="xai", prompt=prompt,
)
if image_input and refs:
return error_response(
error="image_url and reference_image_urls cannot be combined on xAI",
error_type="conflicting_inputs",
provider="xai", prompt=prompt,
)
if normalized_aspect_ratio not in VALID_ASPECT_RATIOS:
normalized_aspect_ratio = DEFAULT_ASPECT_RATIO
if normalized_resolution not in VALID_RESOLUTIONS:
normalized_resolution = DEFAULT_RESOLUTION
modality_used = "reference" if refs else ("image" if image_input else "text")
resolved_model = _resolve_model_for_modality(
model,
modality=modality_used,
explicit_model=explicit_model,
)
if refs and resolved_model != DEFAULT_TEXT_TO_VIDEO_MODEL:
if explicit_model:
return error_response(
error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
error_type="timeout",
error=(
"xAI reference-to-video requires "
f"{DEFAULT_TEXT_TO_VIDEO_MODEL}; got {resolved_model}"
),
error_type="unsupported_model",
provider="xai",
model=resolved_model,
prompt=prompt,
)
resolved_model = DEFAULT_TEXT_TO_VIDEO_MODEL
clamped_duration = _clamp_duration(duration, has_reference_images=bool(refs))
payload = {
"model": resolved_model,
"prompt": prompt,
"duration": clamped_duration,
"aspect_ratio": normalized_aspect_ratio,
"resolution": normalized_resolution,
}
if image_input:
payload["image"] = image_input
if refs:
payload["reference_images"] = refs
return await _submit_xai_video_payload(
api_key=api_key,
base_url=base_url,
endpoint="generations",
payload=payload,
prompt=prompt,
resolved_model=resolved_model,
modality=modality_used,
aspect_ratio=normalized_aspect_ratio,
duration=clamped_duration,
operation="generate",
resolution=normalized_resolution,
)
async def _run_xai_video_mutation(
*,
prompt: str,
video_url: str,
model: Optional[str],
endpoint: str,
operation: str,
duration: int,
) -> Dict[str, Any]:
"""Edit or extend using a public HTTPS ``video_url`` input (``url`` on the wire)."""
api_key, base_url = _resolve_xai_credentials()
if not api_key:
return _auth_required_response(prompt)
prompt = (prompt or "").strip()
video_input = await _video_input_from_public_url(
video_url or "",
api_key=api_key,
base_url=base_url,
)
if not prompt:
return error_response(
error="prompt is required for xAI video edit/extend",
error_type="missing_prompt",
provider="xai",
prompt=prompt,
)
if not video_input:
return error_response(
error=(
"video_url must be a public HTTPS MP4 URL "
"(the `video`/`public_url` from a prior Imagine result)"
),
error_type="missing_video",
provider="xai",
prompt=prompt,
)
resolved_model = _resolve_model_for_modality(
model,
modality="text",
explicit_model=bool(model),
)
payload: Dict[str, Any] = {
"model": resolved_model,
"prompt": prompt,
"video": video_input,
}
if endpoint == "extensions":
payload["duration"] = duration
return await _submit_xai_video_payload(
api_key=api_key,
base_url=base_url,
endpoint=endpoint,
payload=payload,
prompt=prompt,
resolved_model=resolved_model,
modality=operation,
aspect_ratio=DEFAULT_ASPECT_RATIO,
duration=duration,
operation=operation,
)
async def _edit_xai_video_async(
*,
prompt: str,
video_url: str,
model: Optional[str],
) -> Dict[str, Any]:
return await _run_xai_video_mutation(
prompt=prompt,
video_url=video_url,
model=model,
endpoint="edits",
operation="edit",
duration=DEFAULT_DURATION,
)
async def _extend_xai_video_async(
*,
prompt: str,
video_url: str,
duration: Optional[int],
model: Optional[str],
) -> Dict[str, Any]:
clamped_duration = _clamp_duration(
duration,
max_seconds=10,
default=DEFAULT_EXTEND_DURATION,
)
return await _run_xai_video_mutation(
prompt=prompt,
video_url=video_url,
model=model,
endpoint="extensions",
operation="extend",
duration=clamped_duration,
)
def _auth_required_response(prompt: str) -> Dict[str, Any]:
return error_response(
error=(
"No xAI credentials found. Sign in via `hermes auth add xai-oauth` "
"(SuperGrok / Premium+) or set XAI_API_KEY from "
"https://console.x.ai/."
),
error_type="auth_required",
provider="xai", prompt=prompt,
)
async def _submit_xai_video_payload(
*,
api_key: str,
base_url: str,
endpoint: str,
payload: Dict[str, Any],
prompt: str,
resolved_model: str,
modality: str,
aspect_ratio: str,
duration: int,
operation: str,
resolution: Optional[str] = None,
) -> Dict[str, Any]:
try:
from tools.xai_http import (
build_xai_storage_options,
maybe_mark_xai_storage_notice_seen,
read_xai_imagine_storage_config,
)
storage_options = build_xai_storage_options(
"video_gen",
filename_prefix="hermes-xai-video",
extension="mp4",
)
storage_notice = maybe_mark_xai_storage_notice_seen("video_gen")
storage_cfg = read_xai_imagine_storage_config("video_gen")
except Exception:
storage_options = None
storage_notice = None
storage_cfg = {"enabled": False}
if storage_options is not None:
payload["storage_options"] = storage_options
async with httpx.AsyncClient() as client:
try:
request_id = await _submit(
client, payload, api_key=api_key, base_url=base_url,
endpoint=endpoint,
)
except httpx.HTTPStatusError as exc:
detail = ""
try:
detail = exc.response.text[:500]
except Exception:
pass
return error_response(
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
error_type="api_error",
provider="xai",
model=resolved_model,
prompt=prompt,
)
message = (
(body.get("error", {}) or {}).get("message")
or body.get("message")
or f"xAI video generation ended with status '{status}'"
poll_result = await _poll(
client, request_id,
api_key=api_key, base_url=base_url,
timeout_seconds=DEFAULT_TIMEOUT_SECONDS,
poll_interval=DEFAULT_POLL_INTERVAL_SECONDS,
)
status = poll_result["status"]
body = poll_result["body"]
if status == "done":
video = body.get("video") or {}
if not isinstance(video, dict):
video = {}
file_output = video.get("file_output") if isinstance(video.get("file_output"), dict) else {}
file_output = file_output or {}
public_video_url, temporary_url, stored_public_url = _xai_video_output_urls(video)
if not public_video_url:
return error_response(
error="xAI video request completed without a video URL",
error_type="empty_response",
provider="xai",
model=body.get("model") or resolved_model,
prompt=prompt,
)
extra: Dict[str, Any] = {
"request_id": request_id,
"operation": operation,
"storage_enabled": bool(storage_cfg.get("enabled")),
}
if resolution:
extra["resolution"] = resolution
if storage_notice:
extra["storage_notice"] = storage_notice
if stored_public_url:
extra["public_url"] = stored_public_url
if temporary_url:
extra["temporary_url"] = temporary_url
if file_output:
for key in (
"filename",
"expires_at",
"public_url_expires_at",
"public_url_error",
"storage_error",
):
if key in file_output:
extra[key] = file_output[key]
if body.get("usage"):
extra["usage"] = body["usage"]
return success_response(
video=public_video_url,
model=body.get("model") or resolved_model,
prompt=prompt,
modality=modality,
aspect_ratio=aspect_ratio,
duration=video.get("duration") or duration,
provider="xai",
extra=extra,
)
if status == "timeout":
return error_response(
error=message,
error_type=f"xai_{status}",
error=f"Timed out waiting for xAI video request after {DEFAULT_TIMEOUT_SECONDS}s",
error_type="timeout",
provider="xai",
model=resolved_model,
prompt=prompt,
)
message = (
(body.get("error", {}) or {}).get("message")
or body.get("message")
or f"xAI video request ended with status '{status}'"
)
return error_response(
error=message,
error_type=f"xai_{status}",
provider="xai",
model=resolved_model,
prompt=prompt,
)
# ---------------------------------------------------------------------------
# Plugin entry point

View file

@ -1,6 +1,6 @@
name: xai
version: 1.0.0
description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API."
description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, reference-to-video, video editing, video extension, and stored public URLs via the xAI async videos API."
author: NousResearch
kind: backend
requires_env: