mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
feat(image-gen): add image-to-image / editing to image_generate (#48705)
* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). 
- krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
This commit is contained in:
parent
cfb55de5ea
commit
c02192ff6a
13 changed files with 1239 additions and 106 deletions
|
|
@ -87,7 +87,7 @@ class FalImageGenProvider(ImageGenProvider):
|
|||
return {
|
||||
"name": "FAL.ai",
|
||||
"badge": "paid",
|
||||
"tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.",
|
||||
"tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc. — text-to-image & image editing",
|
||||
"env_vars": [
|
||||
{
|
||||
"key": "FAL_KEY",
|
||||
|
|
@ -97,18 +97,40 @@ class FalImageGenProvider(ImageGenProvider):
|
|||
],
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
    """Report the input modalities of the currently selected FAL model.

    Image-to-image availability is per-model: a model entry opts in by
    declaring an ``edit_endpoint``. The active model's actual surface is
    reported so the dynamic tool schema stays accurate.

    Returns:
        A dict with ``modalities`` and ``max_reference_images``. Falls back
        to the text-only surface when the model cannot be resolved or has
        no edit endpoint.
    """
    import tools.image_generation_tool as _it

    text_only: Dict[str, Any] = {"modalities": ["text"], "max_reference_images": 0}

    try:
        _, model_meta = _it._resolve_fal_model()
    except Exception:  # noqa: BLE001
        # Model resolution failed — advertise the conservative surface.
        return text_only

    if not model_meta.get("edit_endpoint"):
        return text_only

    # A missing/zero cap on an edit-capable model still allows one source image.
    reference_cap = int(model_meta.get("max_reference_images") or 1)
    return {"modalities": ["text", "image"], "max_reference_images": reference_cap}
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate an image via the legacy FAL pipeline.
|
||||
"""Generate or edit an image via the legacy FAL pipeline.
|
||||
|
||||
Forwards prompt + aspect_ratio (and any forward-compat extras
|
||||
the schema supports) into :func:`tools.image_generation_tool.image_generate_tool`,
|
||||
then reshapes its JSON-string response into the provider-ABC
|
||||
dict format consumed by ``_dispatch_to_plugin_provider``.
|
||||
Forwards prompt + aspect_ratio + image_url/reference_image_urls (and
|
||||
any forward-compat extras the schema supports) into
|
||||
:func:`tools.image_generation_tool.image_generate_tool`, then reshapes
|
||||
its JSON-string response into the provider-ABC dict format consumed by
|
||||
``_dispatch_to_plugin_provider``.
|
||||
"""
|
||||
import tools.image_generation_tool as _it
|
||||
|
||||
|
|
@ -124,6 +146,13 @@ class FalImageGenProvider(ImageGenProvider):
|
|||
)
|
||||
if key in kwargs and kwargs[key] is not None
|
||||
}
|
||||
# Only forward the image-to-image inputs when actually supplied, so a
|
||||
# plain text-to-image call delegates exactly as it did before (no
|
||||
# noisy None kwargs).
|
||||
if image_url is not None:
|
||||
passthrough["image_url"] = image_url
|
||||
if reference_image_urls is not None:
|
||||
passthrough["reference_image_urls"] = reference_image_urls
|
||||
|
||||
try:
|
||||
raw = _it.image_generate_tool(
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ from agent.image_gen_provider import (
|
|||
DEFAULT_ASPECT_RATIO,
|
||||
ImageGenProvider,
|
||||
error_response,
|
||||
normalize_reference_images,
|
||||
resolve_aspect_ratio,
|
||||
save_url_image,
|
||||
success_response,
|
||||
|
|
@ -191,7 +192,7 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
return {
|
||||
"name": "Krea",
|
||||
"badge": "paid",
|
||||
"tag": "Krea 2 foundation model — Medium ($0.03) + Large ($0.06). Strong style transfer + moodboards.",
|
||||
"tag": "Krea 2 foundation model — Medium ($0.03) + Large ($0.06). Style transfer, moodboards, reference-guided generation.",
|
||||
"env_vars": [
|
||||
{
|
||||
"key": "KREA_API_KEY",
|
||||
|
|
@ -201,6 +202,11 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
],
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
    """Declare Krea's supported input modalities.

    Krea offers reference-guided generation (image-to-image style
    transfer) through ``image_style_references``, with up to 10 refs.
    """
    supported_modalities = ["text", "image"]
    return {
        "modalities": supported_modalities,
        "max_reference_images": 10,
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# generate()
|
||||
# ------------------------------------------------------------------
|
||||
|
|
@ -209,12 +215,48 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
prompt = (prompt or "").strip()
|
||||
aspect = resolve_aspect_ratio(aspect_ratio)
|
||||
krea_ar = _ASPECT_MAP.get(aspect, "1:1")
|
||||
|
||||
# Collect reference images for reference-guided generation (image-to-
|
||||
# image style transfer). Sources, in order:
|
||||
# 1. unified image_url (primary source) + reference_image_urls (strings)
|
||||
# 2. legacy image_style_references kwarg — may be plain URL strings OR
|
||||
# Krea's richer ref objects (e.g. {"url": ..., "strength": ...}),
|
||||
# which are passed through verbatim for backward compatibility.
|
||||
style_refs: List[Any] = []
|
||||
if isinstance(image_url, str) and image_url.strip():
|
||||
style_refs.append(image_url.strip())
|
||||
for ref in (normalize_reference_images(reference_image_urls) or []):
|
||||
style_refs.append(ref)
|
||||
legacy_refs = kwargs.get("image_style_references")
|
||||
if isinstance(legacy_refs, list):
|
||||
for ref in legacy_refs:
|
||||
if isinstance(ref, str):
|
||||
if ref.strip():
|
||||
style_refs.append(ref.strip())
|
||||
elif ref:
|
||||
# Non-string ref object (dict, etc.) — pass through as-is.
|
||||
style_refs.append(ref)
|
||||
# Dedupe string entries while preserving order (dict refs aren't
|
||||
# hashable, so they're kept verbatim); Krea caps at 10.
|
||||
seen: set = set()
|
||||
deduped: List[Any] = []
|
||||
for r in style_refs:
|
||||
if isinstance(r, str):
|
||||
if r in seen:
|
||||
continue
|
||||
seen.add(r)
|
||||
deduped.append(r)
|
||||
style_refs = deduped[:10]
|
||||
modality = "image" if style_refs else "text"
|
||||
|
||||
if not prompt:
|
||||
return error_response(
|
||||
error="Prompt is required and must be a non-empty string",
|
||||
|
|
@ -256,10 +298,10 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
if isinstance(styles, list) and styles:
|
||||
payload["styles"] = styles
|
||||
|
||||
image_style_references = kwargs.get("image_style_references")
|
||||
if isinstance(image_style_references, list) and image_style_references:
|
||||
# Krea caps at 10 refs per request.
|
||||
payload["image_style_references"] = image_style_references[:10]
|
||||
if style_refs:
|
||||
# Reference-guided generation (image-to-image style transfer).
|
||||
# Krea caps at 10 refs per request (already clamped above).
|
||||
payload["image_style_references"] = style_refs
|
||||
|
||||
moodboards = kwargs.get("moodboards")
|
||||
if isinstance(moodboards, list) and moodboards:
|
||||
|
|
@ -483,19 +525,19 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
# Per Krea's job-lifecycle docs the completed payload exposes
|
||||
# ``result.urls`` (an array). Fall back to a single ``url`` field
|
||||
# for forward/backward compatibility.
|
||||
image_url: Optional[str] = None
|
||||
result_image_url: Optional[str] = None
|
||||
urls = result.get("urls")
|
||||
if isinstance(urls, list) and urls:
|
||||
for candidate in urls:
|
||||
if isinstance(candidate, str) and candidate.strip():
|
||||
image_url = candidate.strip()
|
||||
result_image_url = candidate.strip()
|
||||
break
|
||||
if image_url is None:
|
||||
if result_image_url is None:
|
||||
single = result.get("url")
|
||||
if isinstance(single, str) and single.strip():
|
||||
image_url = single.strip()
|
||||
result_image_url = single.strip()
|
||||
|
||||
if image_url is None:
|
||||
if result_image_url is None:
|
||||
return error_response(
|
||||
error="Krea result contained no image URL",
|
||||
error_type="empty_response",
|
||||
|
|
@ -508,14 +550,14 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
# Materialise locally — Krea result URLs may expire, mirroring
|
||||
# what we do for xAI / OpenAI URL responses (#26942).
|
||||
try:
|
||||
saved_path = save_url_image(image_url, prefix=f"krea_{model_id}")
|
||||
saved_path = save_url_image(result_image_url, prefix=f"krea_{model_id}")
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning(
|
||||
"Krea image URL %s could not be cached (%s); falling back to bare URL.",
|
||||
image_url,
|
||||
result_image_url,
|
||||
exc,
|
||||
)
|
||||
image_ref = image_url
|
||||
image_ref = result_image_url
|
||||
else:
|
||||
image_ref = str(saved_path)
|
||||
|
||||
|
|
@ -534,6 +576,7 @@ class KreaImageGenProvider(ImageGenProvider):
|
|||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
provider="krea",
|
||||
modality=modality,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -319,7 +319,7 @@ class OpenAICodexImageGenProvider(ImageGenProvider):
|
|||
return {
|
||||
"name": "OpenAI (Codex auth)",
|
||||
"badge": "free",
|
||||
"tag": "gpt-image-2 via ChatGPT/Codex OAuth — no API key required",
|
||||
"tag": "gpt-image-2 via ChatGPT/Codex OAuth — no API key required (text-to-image only)",
|
||||
"env_vars": [],
|
||||
"post_setup_hint": (
|
||||
"Sign in with `hermes auth codex` (or `hermes setup` → Codex) "
|
||||
|
|
@ -327,15 +327,41 @@ class OpenAICodexImageGenProvider(ImageGenProvider):
|
|||
),
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
    """Declare the Codex OAuth backend as text-to-image only.

    The Codex Responses image_generation tool path is text-to-image only
    here; image-to-image / editing via Codex OAuth is not wired. Users who
    need editing should use the `openai` (API key), `fal`, or `xai`
    backends. Declaring text-only keeps the dynamic tool schema honest so
    the model doesn't attempt an unsupported edit.
    """
    declared: Dict[str, Any] = {
        "modalities": ["text"],
        "max_reference_images": 0,
    }
    return declared
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
prompt = (prompt or "").strip()
|
||||
aspect = resolve_aspect_ratio(aspect_ratio)
|
||||
|
||||
# Image-to-image / editing is not supported on the Codex OAuth path.
|
||||
# Surface a clear, actionable error instead of silently ignoring the
|
||||
# source image and producing an unrelated picture.
|
||||
if (isinstance(image_url, str) and image_url.strip()) or reference_image_urls:
|
||||
return error_response(
|
||||
error=(
|
||||
"This model is not capable of image-to-image / editing. "
|
||||
"Please provide a text-only prompt (drop image_url and "
|
||||
"reference_image_urls)."
|
||||
),
|
||||
error_type="modality_unsupported",
|
||||
provider="openai-codex",
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
|
||||
if not prompt:
|
||||
return error_response(
|
||||
error="Prompt is required and must be a non-empty string",
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from agent.image_gen_provider import (
|
|||
DEFAULT_ASPECT_RATIO,
|
||||
ImageGenProvider,
|
||||
error_response,
|
||||
normalize_reference_images,
|
||||
resolve_aspect_ratio,
|
||||
save_b64_image,
|
||||
save_url_image,
|
||||
|
|
@ -117,13 +118,48 @@ def _resolve_model() -> Tuple[str, Dict[str, Any]]:
|
|||
return DEFAULT_MODEL, _MODELS[DEFAULT_MODEL]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Source-image loading (for image-to-image / edit)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _load_image_bytes(ref: str) -> Tuple[bytes, str]:
    """Load image bytes from a URL, a ``data:`` URI, or a local file path.

    Args:
        ref: An ``http(s)://`` URL, a base64 ``data:`` URI, or a filesystem
            path. Surrounding whitespace is ignored.

    Returns:
        ``(data, filename)``. The filename is used by the caller to name the
        upload, so it always carries an extension: when the URL or path
        basename has none, one is derived from the payload's magic bytes
        (defaulting to ``png``).

    Raises:
        Any network / IO / decoding error, so the caller can surface a clean
        error_response.
    """
    ref = ref.strip()
    lower = ref.lower()

    if lower.startswith("data:"):
        import base64

        header, _, b64 = ref.partition(",")
        ext = "png"
        if "image/" in header:
            # "data:image/jpeg;base64" -> "jpeg"
            ext = header.split("image/", 1)[1].split(";", 1)[0] or "png"
        return base64.b64decode(b64), f"image.{ext}"

    if lower.startswith(("http://", "https://")):
        import requests
        from urllib.parse import urlsplit

        resp = requests.get(ref, timeout=60)
        resp.raise_for_status()
        data = resp.content
        # Take the basename from the parsed path only, so the query string,
        # the fragment, and the host of a path-less URL never leak into it.
        name = urlsplit(ref).path.rsplit("/", 1)[-1] or "image.png"
    else:
        # Local file path.
        with open(ref, "rb") as fh:
            data = fh.read()
        name = os.path.basename(ref) or "image.png"

    if not os.path.splitext(name)[1]:
        # Extension-less names (e.g. CDN object ids) would be uploaded with a
        # generic content type; give them an image extension.
        name = f"{name}.{_sniff_image_ext(data)}"
    return data, name


def _sniff_image_ext(data: bytes) -> str:
    """Guess an image file extension from leading magic bytes (default ``png``)."""
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    if data.startswith(b"\xff\xd8\xff"):
        return "jpg"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "webp"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "gif"
    return "png"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class OpenAIImageGenProvider(ImageGenProvider):
|
||||
"""OpenAI ``images.generate`` backend — gpt-image-2 at low/medium/high."""
|
||||
"""OpenAI ``images.generate`` / ``images.edit`` backend — gpt-image-2."""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
|
|
@ -161,7 +197,7 @@ class OpenAIImageGenProvider(ImageGenProvider):
|
|||
return {
|
||||
"name": "OpenAI",
|
||||
"badge": "paid",
|
||||
"tag": "gpt-image-2 at low/medium/high quality tiers",
|
||||
"tag": "gpt-image-2 at low/medium/high quality tiers — text-to-image & image editing",
|
||||
"env_vars": [
|
||||
{
|
||||
"key": "OPENAI_API_KEY",
|
||||
|
|
@ -171,10 +207,18 @@ class OpenAIImageGenProvider(ImageGenProvider):
|
|||
],
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
    """Declare OpenAI's supported input modalities.

    gpt-image-2 supports editing via ``images.edit()`` with up to 16
    source images.
    """
    reference_cap = 16
    return {
        "modalities": ["text", "image"],
        "max_reference_images": reference_cap,
    }
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
prompt = (prompt or "").strip()
|
||||
|
|
@ -213,29 +257,82 @@ class OpenAIImageGenProvider(ImageGenProvider):
|
|||
tier_id, meta = _resolve_model()
|
||||
size = _SIZES.get(aspect, _SIZES["square"])
|
||||
|
||||
# gpt-image-2 returns b64_json unconditionally and REJECTS
|
||||
# ``response_format`` as an unknown parameter. Don't send it.
|
||||
payload: Dict[str, Any] = {
|
||||
"model": API_MODEL,
|
||||
"prompt": prompt,
|
||||
"size": size,
|
||||
"n": 1,
|
||||
"quality": meta["quality"],
|
||||
}
|
||||
# Collect source images (primary + references) for image-to-image.
|
||||
sources: List[str] = []
|
||||
if isinstance(image_url, str) and image_url.strip():
|
||||
sources.append(image_url.strip())
|
||||
for ref in (normalize_reference_images(reference_image_urls) or []):
|
||||
sources.append(ref)
|
||||
sources = sources[:16] # gpt-image-2 edit caps at 16 images
|
||||
is_edit = bool(sources)
|
||||
modality = "image" if is_edit else "text"
|
||||
|
||||
try:
|
||||
client = openai.OpenAI()
|
||||
response = client.images.generate(**payload)
|
||||
except Exception as exc:
|
||||
logger.debug("OpenAI image generation failed", exc_info=True)
|
||||
return error_response(
|
||||
error=f"OpenAI image generation failed: {exc}",
|
||||
error_type="api_error",
|
||||
provider="openai",
|
||||
model=tier_id,
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
client = openai.OpenAI()
|
||||
|
||||
if is_edit:
|
||||
# images.edit() expects file-like objects. Download/read each
|
||||
# source into a named BytesIO so the SDK sends correct multipart.
|
||||
import io
|
||||
|
||||
try:
|
||||
files = []
|
||||
for ref in sources:
|
||||
data, fname = _load_image_bytes(ref)
|
||||
bio = io.BytesIO(data)
|
||||
bio.name = fname
|
||||
files.append(bio)
|
||||
except Exception as exc:
|
||||
return error_response(
|
||||
error=f"Could not load source image for editing: {exc}",
|
||||
error_type="io_error",
|
||||
provider="openai",
|
||||
model=tier_id,
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
|
||||
try:
|
||||
response = client.images.edit(
|
||||
model=API_MODEL,
|
||||
image=files if len(files) > 1 else files[0],
|
||||
prompt=prompt,
|
||||
size=size, # type: ignore[arg-type] # _SIZES values are valid gpt-image sizes
|
||||
quality=meta["quality"],
|
||||
n=1,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("OpenAI image edit failed", exc_info=True)
|
||||
return error_response(
|
||||
error=f"OpenAI image editing failed: {exc}",
|
||||
error_type="api_error",
|
||||
provider="openai",
|
||||
model=tier_id,
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
else:
|
||||
# gpt-image-2 returns b64_json unconditionally and REJECTS
|
||||
# ``response_format`` as an unknown parameter. Don't send it.
|
||||
payload: Dict[str, Any] = {
|
||||
"model": API_MODEL,
|
||||
"prompt": prompt,
|
||||
"size": size,
|
||||
"n": 1,
|
||||
"quality": meta["quality"],
|
||||
}
|
||||
|
||||
try:
|
||||
response = client.images.generate(**payload)
|
||||
except Exception as exc:
|
||||
logger.debug("OpenAI image generation failed", exc_info=True)
|
||||
return error_response(
|
||||
error=f"OpenAI image generation failed: {exc}",
|
||||
error_type="api_error",
|
||||
provider="openai",
|
||||
model=tier_id,
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
|
||||
data = getattr(response, "data", None) or []
|
||||
if not data:
|
||||
|
|
@ -302,6 +399,7 @@ class OpenAIImageGenProvider(ImageGenProvider):
|
|||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
provider="openai",
|
||||
modality=modality,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ from agent.image_gen_provider import (
|
|||
DEFAULT_ASPECT_RATIO,
|
||||
ImageGenProvider,
|
||||
error_response,
|
||||
normalize_reference_images,
|
||||
resolve_aspect_ratio,
|
||||
save_b64_image,
|
||||
save_url_image,
|
||||
|
|
@ -114,6 +115,31 @@ def _resolve_resolution() -> str:
|
|||
return DEFAULT_RESOLUTION
|
||||
|
||||
|
||||
def _xai_image_field(source: str) -> Dict[str, str]:
    """Build the ``image`` field of an xAI edit request.

    xAI's ``/v1/images/edits`` accepts ``{"url": <ref>, "type": "image_url"}``
    where ``<ref>`` is a public URL or a base64 data URI.

    Args:
        source: A public URL, a ``data:`` URI, or a local file path.
            Surrounding whitespace is ignored.

    Returns:
        The ``image`` dict. URLs and existing data URIs are passed through
        unchanged; a local file is read and embedded as a ``data:`` URI whose
        subtype comes from the file extension (``png`` when there is none,
        ``jpg`` normalised to ``jpeg``).

    Raises:
        OSError: If a local file path cannot be opened or read.
    """
    import base64
    import os as _os

    cleaned = source.strip()
    if cleaned.lower().startswith(("http://", "https://", "data:")):
        return {"url": cleaned, "type": "image_url"}

    # Anything else is treated as a local file path -> base64 data URI.
    with open(cleaned, "rb") as handle:
        payload = handle.read()

    suffix = _os.path.splitext(cleaned)[1].lstrip(".")
    subtype = suffix.lower() if suffix else "png"
    if subtype == "jpg":
        subtype = "jpeg"

    encoded = base64.b64encode(payload).decode("utf-8")
    return {"url": f"data:image/{subtype};base64,{encoded}", "type": "image_url"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -153,18 +179,34 @@ class XAIImageGenProvider(ImageGenProvider):
|
|||
return {
|
||||
"name": "xAI Grok Imagine (image)",
|
||||
"badge": "paid",
|
||||
"tag": "grok-imagine-image — text-to-image; uses xAI Grok OAuth or XAI_API_KEY",
|
||||
"tag": "grok-imagine-image — text-to-image & image editing; uses xAI Grok OAuth or XAI_API_KEY",
|
||||
"env_vars": [],
|
||||
"post_setup": "xai_grok",
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
    """Declare xAI's supported input modalities.

    xAI's ``/v1/images/edits`` supports image editing via
    grok-imagine-image-quality. Only a single primary source image is
    declared: multi-image editing exists as a separate capability, but the
    primary edit surface is what is exposed here.
    """
    declared: Dict[str, Any] = {"modalities": ["text", "image"]}
    declared["max_reference_images"] = 1
    return declared
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate an image using xAI's grok-imagine-image."""
|
||||
"""Generate an image (text-to-image) or edit a source image (image-to-image).
|
||||
|
||||
Routing: when ``image_url`` is provided, POST to ``/v1/images/edits``
|
||||
with the source image; otherwise POST to ``/v1/images/generations``.
|
||||
Per xAI docs, editing uses the ``grok-imagine-image-quality`` model and
|
||||
a JSON body (the OpenAI SDK's multipart ``images.edit()`` is NOT
|
||||
supported by xAI).
|
||||
"""
|
||||
creds = resolve_xai_http_credentials()
|
||||
api_key = str(creds.get("api_key") or "").strip()
|
||||
provider_name = str(creds.get("provider") or "xai").strip() or "xai"
|
||||
|
|
@ -182,12 +224,17 @@ class XAIImageGenProvider(ImageGenProvider):
|
|||
resolution = _resolve_resolution()
|
||||
xai_res = resolution if resolution in _XAI_RESOLUTIONS else DEFAULT_RESOLUTION
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"model": model_id,
|
||||
"prompt": prompt,
|
||||
"aspect_ratio": xai_ar,
|
||||
"resolution": xai_res,
|
||||
}
|
||||
# Pick the primary source image: explicit image_url wins, else the
|
||||
# first reference image.
|
||||
source_image = None
|
||||
if isinstance(image_url, str) and image_url.strip():
|
||||
source_image = image_url.strip()
|
||||
else:
|
||||
refs = normalize_reference_images(reference_image_urls)
|
||||
if refs:
|
||||
source_image = refs[0]
|
||||
is_edit = bool(source_image)
|
||||
modality = "image" if is_edit else "text"
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
|
|
@ -197,9 +244,41 @@ class XAIImageGenProvider(ImageGenProvider):
|
|||
|
||||
base_url = str(creds.get("base_url") or "https://api.x.ai/v1").strip().rstrip("/")
|
||||
|
||||
if is_edit:
|
||||
# Editing requires the quality model per xAI docs. The source
|
||||
# image may be a public URL or a base64 data URI; local file paths
|
||||
# are converted to a data URI here.
|
||||
edit_model = "grok-imagine-image-quality"
|
||||
try:
|
||||
image_field = _xai_image_field(source_image)
|
||||
except Exception as exc:
|
||||
return error_response(
|
||||
error=f"Could not load source image for editing: {exc}",
|
||||
error_type="io_error",
|
||||
provider=provider_name,
|
||||
model=edit_model,
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
)
|
||||
payload: Dict[str, Any] = {
|
||||
"model": edit_model,
|
||||
"prompt": prompt,
|
||||
"image": image_field,
|
||||
}
|
||||
endpoint_url = f"{base_url}/images/edits"
|
||||
model_id = edit_model
|
||||
else:
|
||||
payload = {
|
||||
"model": model_id,
|
||||
"prompt": prompt,
|
||||
"aspect_ratio": xai_ar,
|
||||
"resolution": xai_res,
|
||||
}
|
||||
endpoint_url = f"{base_url}/images/generations"
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{base_url}/images/generations",
|
||||
endpoint_url,
|
||||
headers=headers,
|
||||
json=payload,
|
||||
timeout=120,
|
||||
|
|
@ -310,9 +389,9 @@ class XAIImageGenProvider(ImageGenProvider):
|
|||
aspect_ratio=aspect,
|
||||
)
|
||||
|
||||
extra: Dict[str, Any] = {
|
||||
"resolution": xai_res,
|
||||
}
|
||||
extra: Dict[str, Any] = {}
|
||||
if not is_edit:
|
||||
extra["resolution"] = xai_res
|
||||
|
||||
return success_response(
|
||||
image=image_ref,
|
||||
|
|
@ -320,6 +399,7 @@ class XAIImageGenProvider(ImageGenProvider):
|
|||
prompt=prompt,
|
||||
aspect_ratio=aspect,
|
||||
provider="xai",
|
||||
modality=modality,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue