mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-20 10:11:58 +00:00
* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). 
- krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
414 lines
14 KiB
Python
414 lines
14 KiB
Python
"""OpenAI image generation backend.
|
|
|
|
Exposes OpenAI's ``gpt-image-2`` model at three quality tiers as an
|
|
:class:`ImageGenProvider` implementation. The tiers are implemented as
|
|
three virtual model IDs so the ``hermes tools`` model picker and the
|
|
``image_gen.model`` config key behave like any other multi-model backend:
|
|
|
|
gpt-image-2-low ~15s fastest, good for iteration
|
|
gpt-image-2-medium ~40s default — balanced
|
|
gpt-image-2-high ~2min slowest, highest fidelity
|
|
|
|
All three hit the same underlying API model (``gpt-image-2``) with a
|
|
different ``quality`` parameter. Output is base64 JSON → saved under
|
|
``$HERMES_HOME/cache/images/``.
|
|
|
|
Selection precedence (first hit wins):
|
|
|
|
1. ``OPENAI_IMAGE_MODEL`` env var (escape hatch for scripts / tests)
|
|
2. ``image_gen.openai.model`` in ``config.yaml``
|
|
3. ``image_gen.model`` in ``config.yaml`` (when it's one of our tier IDs)
|
|
4. :data:`DEFAULT_MODEL` — ``gpt-image-2-medium``
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from agent.image_gen_provider import (
|
|
DEFAULT_ASPECT_RATIO,
|
|
ImageGenProvider,
|
|
error_response,
|
|
normalize_reference_images,
|
|
resolve_aspect_ratio,
|
|
save_b64_image,
|
|
save_url_image,
|
|
success_response,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Model catalog
# ---------------------------------------------------------------------------
#
# All three IDs resolve to the same underlying API model with a different
# ``quality`` setting. ``API_MODEL`` is what gets sent to OpenAI;
# ``quality`` is the knob that changes generation time and output fidelity.

# Model name placed in the ``model`` field of every OpenAI request,
# regardless of which virtual tier was selected.
API_MODEL = "gpt-image-2"

# Virtual tier ID -> metadata.  ``display`` / ``speed`` / ``strengths`` are
# surfaced by ``list_models()``; ``quality`` is forwarded to the API call.
_MODELS: Dict[str, Dict[str, Any]] = {
    "gpt-image-2-low": {
        "display": "GPT Image 2 (Low)",
        "speed": "~15s",
        "strengths": "Fast iteration, lowest cost",
        "quality": "low",
    },
    "gpt-image-2-medium": {
        "display": "GPT Image 2 (Medium)",
        "speed": "~40s",
        "strengths": "Balanced — default",
        "quality": "medium",
    },
    "gpt-image-2-high": {
        "display": "GPT Image 2 (High)",
        "speed": "~2min",
        "strengths": "Highest fidelity, strongest prompt adherence",
        "quality": "high",
    },
}

# Tier used when neither the environment nor config.yaml names a known tier.
DEFAULT_MODEL = "gpt-image-2-medium"

# Aspect-ratio name -> ``size`` string sent to the API.
_SIZES = {
    "landscape": "1536x1024",
    "square": "1024x1024",
    "portrait": "1024x1536",
}
|
|
|
|
|
|
def _load_openai_config() -> Dict[str, Any]:
    """Return the ``image_gen`` mapping from config.yaml.

    Any failure (config module not importable, loader raising, section
    missing or not a mapping) yields an empty dict; exceptions are logged
    at debug level and never propagated.
    """
    try:
        # Imported lazily, inside the guard, so an unavailable config module
        # degrades to "no configuration" instead of breaking the plugin.
        from hermes_cli.config import load_config

        loaded = load_config()
        if not isinstance(loaded, dict):
            return {}

        image_gen = loaded.get("image_gen")
        if isinstance(image_gen, dict):
            return image_gen
        return {}
    except Exception as exc:
        logger.debug("Could not load image_gen config: %s", exc)
        return {}
|
|
|
|
|
|
def _resolve_model() -> Tuple[str, Dict[str, Any]]:
    """Pick the active quality tier and return ``(model_id, meta)``.

    Precedence, first known tier ID wins:

    1. ``OPENAI_IMAGE_MODEL`` environment variable
    2. ``openai.model`` inside the ``image_gen`` config section
    3. ``model`` at the top of the ``image_gen`` config section
    4. ``DEFAULT_MODEL``

    Values that are not keys of ``_MODELS`` are skipped.
    """
    forced = os.environ.get("OPENAI_IMAGE_MODEL")
    if forced and forced in _MODELS:
        return forced, _MODELS[forced]

    cfg = _load_openai_config()
    provider_section = cfg.get("openai")
    scoped_choice = (
        provider_section.get("model")
        if isinstance(provider_section, dict)
        else None
    )

    # Provider-scoped setting first, then the shared ``image_gen.model`` key.
    for choice in (scoped_choice, cfg.get("model")):
        if isinstance(choice, str) and choice in _MODELS:
            return choice, _MODELS[choice]

    return DEFAULT_MODEL, _MODELS[DEFAULT_MODEL]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Source-image loading (for image-to-image / edit)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _sniff_image_extension(data: bytes) -> Optional[str]:
    """Return an extension guessed from *data*'s magic bytes, or ``None``."""
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    if data.startswith(b"\xff\xd8\xff"):
        return "jpg"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "webp"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "gif"
    return None


def _load_image_bytes(ref: str) -> Tuple[bytes, str]:
    """Load image bytes from a URL, data URI, ``file://`` URI or local path.

    Args:
        ref: ``http(s)://`` URL, ``data:`` URI (payload is base64-decoded),
            ``file://`` URI, or filesystem path (``~`` is expanded).
            Surrounding whitespace is ignored.

    Returns:
        ``(data, filename)``.  When the derived filename does not end in a
        common image extension and the bytes carry a recognisable PNG / JPEG
        / WebP / GIF signature, the matching extension is appended.

    Raises:
        ValueError: if *ref* is empty after stripping.
        Exception: any network / IO / decoding error is propagated so the
            caller can surface a clean error_response.
    """
    ref = ref.strip()
    if not ref:
        raise ValueError("Empty image reference")
    lower = ref.lower()

    if lower.startswith(("http://", "https://")):
        import requests

        resp = requests.get(ref, timeout=60)
        resp.raise_for_status()
        data = resp.content
        name = ref.split("?", 1)[0].rsplit("/", 1)[-1] or "image.png"
    elif lower.startswith("data:"):
        import base64

        header, _, b64 = ref.partition(",")
        # Media types are case-insensitive, so inspect a lowered header.
        header_lower = header.lower()
        ext = "png"
        if "image/" in header_lower:
            ext = header_lower.split("image/", 1)[1].split(";", 1)[0].strip() or "png"
        data = base64.b64decode(b64)
        name = f"image.{ext}"
    else:
        path = ref
        if lower.startswith("file://"):
            from urllib.parse import urlparse
            from urllib.request import url2pathname

            path = url2pathname(urlparse(ref).path)
        path = os.path.expanduser(path)
        with open(path, "rb") as fh:
            data = fh.read()
        name = os.path.basename(path) or "image.png"

    # The filename travels with the bytes into a multipart upload, where the
    # part's content type is presumably inferred from the extension — so give
    # extension-less names one based on the actual bytes when we can.
    known_extensions = (".png", ".jpg", ".jpeg", ".webp", ".gif")
    if not name.lower().endswith(known_extensions):
        sniffed = _sniff_image_extension(data)
        if sniffed:
            name = f"{name}.{sniffed}"

    return data, name
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Provider
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class OpenAIImageGenProvider(ImageGenProvider):
    """OpenAI ``images.generate`` / ``images.edit`` backend — gpt-image-2.

    Text-only requests are sent to ``images.generate``; requests carrying
    ``image_url`` and/or ``reference_image_urls`` are sent to
    ``images.edit``.  ``generate()`` reports the failures it anticipates as
    ``error_response`` results instead of raising.
    """

    # Maximum number of source images forwarded to ``images.edit``.  Shared
    # by ``capabilities()`` and ``generate()`` so the advertised cap and the
    # enforced cap cannot drift apart.
    _MAX_SOURCE_IMAGES = 16

    @property
    def name(self) -> str:
        """Registry identifier of this provider."""
        return "openai"

    @property
    def display_name(self) -> str:
        """Human-readable provider name."""
        return "OpenAI"

    def is_available(self) -> bool:
        """Return True when ``OPENAI_API_KEY`` is set and ``openai`` imports."""
        if not os.environ.get("OPENAI_API_KEY"):
            return False
        try:
            import openai  # noqa: F401
        except ImportError:
            return False
        return True

    def list_models(self) -> List[Dict[str, Any]]:
        """Return one descriptor per virtual quality tier in ``_MODELS``."""
        return [
            {
                "id": model_id,
                "display": meta["display"],
                "speed": meta["speed"],
                "strengths": meta["strengths"],
                "price": "varies",
            }
            for model_id, meta in _MODELS.items()
        ]

    def default_model(self) -> Optional[str]:
        """Return the tier ID used when nothing is configured."""
        return DEFAULT_MODEL

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the setup descriptor (label, badge, required env vars)."""
        return {
            "name": "OpenAI",
            "badge": "paid",
            "tag": "gpt-image-2 at low/medium/high quality tiers — text-to-image & image editing",
            "env_vars": [
                {
                    "key": "OPENAI_API_KEY",
                    "prompt": "OpenAI API key",
                    "url": "https://platform.openai.com/api-keys",
                },
            ],
        }

    def capabilities(self) -> Dict[str, Any]:
        """Declare supported modalities and the source-image cap."""
        # gpt-image-2 supports editing via images.edit() with up to 16 source
        # images.
        return {
            "modalities": ["text", "image"],
            "max_reference_images": self._MAX_SOURCE_IMAGES,
        }

    def generate(
        self,
        prompt: str,
        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
        *,
        image_url: Optional[str] = None,
        reference_image_urls: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate a new image, or edit the supplied source image(s).

        Args:
            prompt: Text instruction; must be non-empty after stripping.
            aspect_ratio: Passed through ``resolve_aspect_ratio`` and mapped
                to a pixel size via ``_SIZES`` (square when unmapped).
            image_url: Optional primary source image.  Supplying it, or any
                reference image, switches the call to ``images.edit``.
            reference_image_urls: Optional additional source images.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The result of ``success_response`` (image reference is a cached
            file path, or the bare URL if caching a URL result failed), or
            of ``error_response`` describing what went wrong.
        """
        prompt = (prompt or "").strip()
        aspect = resolve_aspect_ratio(aspect_ratio)

        if not prompt:
            return error_response(
                error="Prompt is required and must be a non-empty string",
                error_type="invalid_argument",
                provider="openai",
                aspect_ratio=aspect,
            )

        if not os.environ.get("OPENAI_API_KEY"):
            return error_response(
                error=(
                    "OPENAI_API_KEY not set. Run `hermes tools` → Image "
                    "Generation → OpenAI to configure, or `hermes setup` "
                    "to add the key."
                ),
                error_type="auth_required",
                provider="openai",
                aspect_ratio=aspect,
            )

        try:
            import openai
        except ImportError:
            return error_response(
                error="openai Python package not installed (pip install openai)",
                error_type="missing_dependency",
                provider="openai",
                aspect_ratio=aspect,
            )

        tier_id, meta = _resolve_model()
        size = _SIZES.get(aspect, _SIZES["square"])

        def fail(message: str, error_type: str) -> Dict[str, Any]:
            # Every failure past this point reports the same request context.
            return error_response(
                error=message,
                error_type=error_type,
                provider="openai",
                model=tier_id,
                prompt=prompt,
                aspect_ratio=aspect,
            )

        # Collect source images (primary + references) for image-to-image.
        sources: List[str] = []
        if isinstance(image_url, str) and image_url.strip():
            sources.append(image_url.strip())
        sources.extend(normalize_reference_images(reference_image_urls) or [])

        cap = self._MAX_SOURCE_IMAGES
        if len(sources) > cap:
            # Still truncate, but leave a trace so dropped inputs are
            # diagnosable.
            logger.warning(
                "OpenAI image edit accepts at most %d source images; ignoring %d extra.",
                cap,
                len(sources) - cap,
            )
            sources = sources[:cap]

        is_edit = bool(sources)
        modality = "image" if is_edit else "text"

        # Client construction can itself raise; keep the "errors are returned,
        # not raised" contract the rest of this method follows.
        try:
            client = openai.OpenAI()
        except Exception as exc:
            logger.debug("OpenAI client construction failed", exc_info=True)
            return fail(f"Could not initialise OpenAI client: {exc}", "api_error")

        if is_edit:
            # images.edit() expects file-like objects. Download/read each
            # source into a named BytesIO so the SDK sends correct multipart.
            import io

            try:
                files = []
                for ref in sources:
                    raw, fname = _load_image_bytes(ref)
                    bio = io.BytesIO(raw)
                    bio.name = fname
                    files.append(bio)
            except Exception as exc:
                return fail(
                    f"Could not load source image for editing: {exc}",
                    "io_error",
                )

            try:
                response = client.images.edit(
                    model=API_MODEL,
                    image=files if len(files) > 1 else files[0],
                    prompt=prompt,
                    size=size,  # type: ignore[arg-type]  # _SIZES values are valid gpt-image sizes
                    quality=meta["quality"],
                    n=1,
                )
            except Exception as exc:
                logger.debug("OpenAI image edit failed", exc_info=True)
                return fail(f"OpenAI image editing failed: {exc}", "api_error")
        else:
            # gpt-image-2 returns b64_json unconditionally and REJECTS
            # ``response_format`` as an unknown parameter. Don't send it.
            payload: Dict[str, Any] = {
                "model": API_MODEL,
                "prompt": prompt,
                "size": size,
                "n": 1,
                "quality": meta["quality"],
            }

            try:
                response = client.images.generate(**payload)
            except Exception as exc:
                logger.debug("OpenAI image generation failed", exc_info=True)
                return fail(f"OpenAI image generation failed: {exc}", "api_error")

        items = getattr(response, "data", None) or []
        if not items:
            return fail("OpenAI returned no image data", "empty_response")

        first = items[0]
        b64 = getattr(first, "b64_json", None)
        url = getattr(first, "url", None)
        revised_prompt = getattr(first, "revised_prompt", None)

        if b64:
            try:
                saved_path = save_b64_image(b64, prefix=f"openai_{tier_id}")
            except Exception as exc:
                return fail(f"Could not save image to cache: {exc}", "io_error")
            image_ref = str(saved_path)
        elif url:
            # Defensive — gpt-image-2 returns b64 today, but OpenAI's API
            # has previously returned URLs. Cache the bytes locally so the
            # gateway never tries to fetch an ephemeral / signed URL after
            # it expires — same rationale as the xAI provider (#26942).
            try:
                saved_path = save_url_image(url, prefix=f"openai_{tier_id}")
            except Exception as exc:
                logger.warning(
                    "OpenAI image URL %s could not be cached (%s); falling back to bare URL.",
                    url,
                    exc,
                )
                image_ref = url
            else:
                image_ref = str(saved_path)
        else:
            return fail(
                "OpenAI response contained neither b64_json nor URL",
                "empty_response",
            )

        extra: Dict[str, Any] = {"size": size, "quality": meta["quality"]}
        if revised_prompt:
            extra["revised_prompt"] = revised_prompt

        return success_response(
            image=image_ref,
            model=tier_id,
            prompt=prompt,
            aspect_ratio=aspect,
            provider="openai",
            modality=modality,
            extra=extra,
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Plugin entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def register(ctx) -> None:
    """Plugin entry point: hand a fresh ``OpenAIImageGenProvider`` to *ctx*.

    The provider instance is passed to ``ctx.register_image_gen_provider``.
    """
    provider = OpenAIImageGenProvider()
    ctx.register_image_gen_provider(provider)
|