mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). 
- krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
393 lines
13 KiB
Python
393 lines
13 KiB
Python
"""
|
|
Image Generation Provider ABC
|
|
=============================
|
|
|
|
Defines the pluggable-backend interface for image generation. Providers register
|
|
instances via ``PluginContext.register_image_gen_provider()``; the active one
|
|
(selected via ``image_gen.provider`` in ``config.yaml``) services every
|
|
``image_generate`` tool call.
|
|
|
|
Providers live in ``<repo>/plugins/image_gen/<name>/`` (built-in, auto-loaded
|
|
as ``kind: backend``) or ``~/.hermes/plugins/image_gen/<name>/`` (user, opt-in
|
|
via ``plugins.enabled``).
|
|
|
|
Unified surface
|
|
---------------
|
|
One tool — ``image_generate`` — covers **text-to-image** and
|
|
**image-to-image / image editing**. The router is the presence of
|
|
``image_url`` (and/or ``reference_image_urls``): if any source image is
|
|
provided, the provider routes to its image-to-image / edit endpoint; if
|
|
omitted, the provider routes to text-to-image. Users pick one **model**
|
|
(e.g. nano-banana-pro, gpt-image-2, grok-imagine-image); the provider
|
|
handles which underlying endpoint to hit. This mirrors the ``video_gen``
|
|
provider design (``agent/video_gen_provider.py``) so the two surfaces
|
|
stay learnable together.
|
|
|
|
Response shape
|
|
--------------
|
|
All providers return a dict that :func:`success_response` / :func:`error_response`
|
|
produce. The tool wrapper JSON-serializes it. Keys:
|
|
|
|
success bool
|
|
image str | None URL or absolute file path
|
|
model str provider-specific model identifier
|
|
prompt str echoed prompt
|
|
aspect_ratio str "landscape" | "square" | "portrait"
|
|
modality str "text" | "image" (which mode was used)
|
|
provider str provider name (for diagnostics)
|
|
error str only when success=False
|
|
error_type str only when success=False
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import abc
|
|
import base64
|
|
import datetime
|
|
import logging
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# The closed set of aspect-ratio keywords; anything else is coerced to
# DEFAULT_ASPECT_RATIO by resolve_aspect_ratio().
VALID_ASPECT_RATIOS: Tuple[str, ...] = ("landscape", "square", "portrait")
DEFAULT_ASPECT_RATIO = "landscape"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ABC
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ImageGenProvider(abc.ABC):
    """Abstract base class for an image generation backend.

    Subclasses must implement the :attr:`name` property and :meth:`generate`
    (both are abstract). Everything else has sane defaults — override only
    what your provider needs.
    """

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Stable short identifier used in ``image_gen.provider`` config.

        Lowercase, no spaces. Examples: ``fal``, ``openai``, ``replicate``.
        """

    @property
    def display_name(self) -> str:
        """Human-readable label shown in ``hermes tools``. Defaults to ``name.title()``."""
        return self.name.title()

    def is_available(self) -> bool:
        """Return True when this provider can service calls.

        Typically checks for a required API key. Default: True
        (providers with no external dependencies are always available).
        """
        return True

    def list_models(self) -> List[Dict[str, Any]]:
        """Return catalog entries for ``hermes tools`` model picker.

        Each entry::

            {
                "id": "gpt-image-1.5",        # required
                "display": "GPT Image 1.5",   # optional; defaults to id
                "speed": "~10s",              # optional
                "strengths": "...",           # optional
                "price": "$...",              # optional
            }

        Default: empty list (provider has no user-selectable models).
        """
        return []

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return provider metadata for the ``hermes tools`` picker.

        Used by ``tools_config.py`` to inject this provider as a row in
        the Image Generation provider list. Shape::

            {
                "name": "OpenAI",                     # picker label
                "badge": "paid",                      # optional short tag
                "tag": "One-line description...",     # optional subtitle
                "env_vars": [                         # keys to prompt for
                    {"key": "OPENAI_API_KEY",
                     "prompt": "OpenAI API key",
                     "url": "https://platform.openai.com/api-keys"},
                ],
            }

        Default: minimal entry derived from ``display_name``. Override to
        expose API key prompts and custom badges.
        """
        return {
            "name": self.display_name,
            "badge": "",
            "tag": "",
            "env_vars": [],
        }

    def default_model(self) -> Optional[str]:
        """Return the default model id, or None if not applicable.

        The default is the ``"id"`` of the first :meth:`list_models` entry.
        """
        models = self.list_models()
        if models:
            return models[0].get("id")
        return None

    def capabilities(self) -> Dict[str, Any]:
        """Return what this provider supports.

        Returned dict (all keys optional)::

            {
                "modalities": ["text", "image"],   # which inputs the backend accepts
                "max_reference_images": 9,         # cap for reference_image_urls
            }

        ``modalities`` declares whether the active backend/model supports
        text-to-image (``"text"``), image-to-image / editing (``"image"``),
        or both. The tool layer surfaces this in the dynamic schema so the
        model knows when ``image_url`` is honored. Used by ``hermes tools``
        for the picker too. Default: text-only (backward compatible — a
        provider that doesn't override this advertises text-to-image only).
        """
        return {
            "modalities": ["text"],
            "max_reference_images": 0,
        }

    @abc.abstractmethod
    def generate(
        self,
        prompt: str,
        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
        *,
        image_url: Optional[str] = None,
        reference_image_urls: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate an image from a text prompt, or edit/transform a source image.

        Routing: if ``image_url`` (or any ``reference_image_urls``) is
        provided, the provider should route to its image-to-image / edit
        endpoint; otherwise text-to-image. ``image_url`` is the primary
        source image to edit; ``reference_image_urls`` are additional
        style/composition references (provider clamps to its declared
        ``max_reference_images``).

        Implementations should return the dict from :func:`success_response`
        or :func:`error_response`. ``kwargs`` may contain forward-compat
        parameters future versions of the schema will expose —
        implementations MUST ignore unknown keys (no TypeError).
        """
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def resolve_aspect_ratio(value: Optional[str]) -> str:
    """Clamp an aspect_ratio value to the valid set, defaulting to landscape.

    Invalid values are coerced rather than rejected so the tool surface is
    forgiving of agent mistakes. Matching ignores surrounding whitespace and
    letter case; non-string input (including ``None``) yields the default.
    """
    if not isinstance(value, str):
        return DEFAULT_ASPECT_RATIO
    v = value.strip().lower()
    if v in VALID_ASPECT_RATIOS:
        return v
    return DEFAULT_ASPECT_RATIO
|
|
|
|
|
|
def normalize_reference_images(value: Any) -> Optional[List[str]]:
    """Coerce a reference-image argument into a clean list of URL/path strings.

    Accepts a single string or a list/tuple; each string entry is stripped of
    surrounding whitespace, and blank or non-string entries are dropped.
    Returns ``None`` when nothing usable remains (including for ``None`` or
    any other unsupported input type) so providers can treat "no refs" as a
    single sentinel.
    """
    if value is None:
        return None
    if isinstance(value, str):
        # A lone string is treated as a one-element list.
        value = [value]
    if not isinstance(value, (list, tuple)):
        return None
    out = [item.strip() for item in value if isinstance(item, str) and item.strip()]
    return out or None
|
|
|
|
|
|
def _images_cache_dir() -> Path:
    """Return ``$HERMES_HOME/cache/images/``, creating parents as needed."""
    # Imported at call time rather than at module import.
    from hermes_constants import get_hermes_home

    path = get_hermes_home() / "cache" / "images"
    path.mkdir(parents=True, exist_ok=True)
    return path
|
|
|
|
|
|
def save_b64_image(
    b64_data: str,
    *,
    prefix: str = "image",
    extension: str = "png",
) -> Path:
    """Decode base64 image data and write it under ``$HERMES_HOME/cache/images/``.

    Returns the absolute :class:`Path` to the saved file.

    Filename format: ``<prefix>_<YYYYMMDD_HHMMSS>_<short-uuid>.<ext>``. The
    timestamp is local wall-clock time and the short uuid is the first eight
    hex digits of a random UUID, which keeps names unique within one second.
    """
    # Decode before touching the filesystem so bad input leaves no file behind.
    raw = base64.b64decode(b64_data)
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    short = uuid.uuid4().hex[:8]
    path = _images_cache_dir() / f"{prefix}_{ts}_{short}.{extension}"
    path.write_bytes(raw)
    return path
|
|
|
|
|
|
# Extension inference for save_url_image — keep small and explicit. We don't
# want to import mimetypes for a handful of formats every image_gen provider
# actually returns, and we never want to inherit a content-type that points
# at HTML or JSON when the API gives us a degenerate response.
_URL_IMAGE_CONTENT_TYPES = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/webp": "webp",
    "image/gif": "gif",
}
|
|
|
|
|
|
def save_url_image(
    url: str,
    *,
    prefix: str = "image",
    timeout: float = 60.0,
    max_bytes: int = 25 * 1024 * 1024,
) -> Path:
    """Download an image URL and write it under ``$HERMES_HOME/cache/images/``.

    Used by providers (xAI, fallback OpenAI) whose API returns an *ephemeral*
    URL instead of inline base64 — those URLs frequently expire before a
    downstream consumer (Telegram ``send_photo``, browser fetch) can resolve
    them, so we materialise the bytes locally at tool-completion time.
    Mirrors :func:`save_b64_image`'s shape so providers can swap in one line.

    Returns the absolute :class:`Path` to the saved file. Raises on any
    network / HTTP / oversize / non-image-content-type error so callers can
    fall back to returning the bare URL with a clear error message:

    * ``requests`` exceptions for network and HTTP-status failures;
    * ``ValueError`` when the response declares a ``text/*`` or
      ``application/json`` body, exceeds ``max_bytes``, or is empty.

    No partially written file is left in the cache on failure, and the
    streamed HTTP connection is always released.
    """
    import requests

    response = requests.get(url, timeout=timeout, stream=True)
    try:
        response.raise_for_status()

        content_type = (
            (response.headers.get("Content-Type") or "")
            .split(";", 1)[0]
            .strip()
            .lower()
        )
        # A 200 response carrying an HTML/JSON error page is not an image;
        # caching it as ``.png`` would report success for a broken file.
        # Generic types such as ``application/octet-stream`` are still
        # accepted because some CDNs use them for real images.
        if content_type.startswith("text/") or content_type == "application/json":
            raise ValueError(
                f"Image at {url} returned non-image content type "
                f"{content_type!r}; refusing to cache."
            )

        # Infer extension from the response content-type, falling back to the
        # URL suffix when xAI / OpenAI omit a precise type (some CDNs return
        # ``application/octet-stream``). Defaults to ``png``.
        extension = _URL_IMAGE_CONTENT_TYPES.get(content_type)
        if extension is None:
            url_path = url.split("?", 1)[0].lower()
            for ext in ("png", "jpg", "jpeg", "webp", "gif"):
                if url_path.endswith(f".{ext}"):
                    extension = "jpg" if ext == "jpeg" else ext
                    break
        if extension is None:
            extension = "png"

        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        short = uuid.uuid4().hex[:8]
        path = _images_cache_dir() / f"{prefix}_{ts}_{short}.{extension}"

        bytes_written = 0
        saved = False
        try:
            with path.open("wb") as fh:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if not chunk:
                        continue
                    bytes_written += len(chunk)
                    # Checked before writing so the file never exceeds the cap.
                    if bytes_written > max_bytes:
                        raise ValueError(
                            f"Image at {url} exceeds {max_bytes // (1024 * 1024)}MB cap; refusing to cache."
                        )
                    fh.write(chunk)

            if bytes_written == 0:
                raise ValueError(f"Image at {url} returned 0 bytes; refusing to cache.")
            saved = True
        finally:
            # Runs after the ``with`` has closed the handle, so the unlink
            # also works on platforms that refuse to delete open files.
            if not saved:
                try:
                    path.unlink()
                except OSError:
                    pass
    finally:
        # With stream=True the connection stays checked out until the body
        # is fully consumed or the response is closed explicitly.
        response.close()

    return path
|
|
|
|
|
|
def success_response(
    *,
    image: str,
    model: str,
    prompt: str,
    aspect_ratio: str,
    provider: str,
    modality: str = "text",
    extra: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build a uniform success response dict.

    ``image`` may be an HTTP URL or an absolute filesystem path (for b64
    providers like OpenAI). ``modality`` is ``"text"`` (text-to-image) or
    ``"image"`` (image-to-image / editing) — indicates which endpoint was
    actually hit, useful for diagnostics. Callers that need to pass through
    additional backend-specific fields can supply ``extra``; its entries are
    added only for keys not already present, so ``extra`` can never override
    the standard fields.
    """
    payload: Dict[str, Any] = {
        "success": True,
        "image": image,
        "model": model,
        "prompt": prompt,
        "aspect_ratio": aspect_ratio,
        "modality": modality,
        "provider": provider,
    }
    if extra:
        for k, v in extra.items():
            # setdefault keeps the core keys authoritative.
            payload.setdefault(k, v)
    return payload
|
|
|
|
|
|
def error_response(
    *,
    error: str,
    error_type: str = "provider_error",
    provider: str = "",
    model: str = "",
    prompt: str = "",
    aspect_ratio: str = DEFAULT_ASPECT_RATIO,
) -> Dict[str, Any]:
    """Build a uniform error response dict.

    The result always has ``success`` set to ``False`` and ``image`` set to
    ``None``; ``error`` carries the message and ``error_type`` a short
    category string (``"provider_error"`` unless overridden). The remaining
    fields echo the request for diagnostics.
    """
    return {
        "success": False,
        "image": None,
        "error": error,
        "error_type": error_type,
        "model": model,
        "prompt": prompt,
        "aspect_ratio": aspect_ratio,
        "provider": provider,
    }
|