mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
feat(image-gen): add image-to-image / editing to image_generate (#48705)
* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). - krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
This commit is contained in:
parent
cfb55de5ea
commit
c02192ff6a
13 changed files with 1239 additions and 106 deletions
|
|
@ -11,6 +11,18 @@ Providers live in ``<repo>/plugins/image_gen/<name>/`` (built-in, auto-loaded
|
|||
as ``kind: backend``) or ``~/.hermes/plugins/image_gen/<name>/`` (user, opt-in
|
||||
via ``plugins.enabled``).
|
||||
|
||||
Unified surface
|
||||
---------------
|
||||
One tool — ``image_generate`` — covers **text-to-image** and
|
||||
**image-to-image / image editing**. The router is the presence of
|
||||
``image_url`` (and/or ``reference_image_urls``): if any source image is
|
||||
provided, the provider routes to its image-to-image / edit endpoint; if
|
||||
omitted, the provider routes to text-to-image. Users pick one **model**
|
||||
(e.g. nano-banana-pro, gpt-image-2, grok-imagine-image); the provider
|
||||
handles which underlying endpoint to hit. This mirrors the ``video_gen``
|
||||
provider design (``agent/video_gen_provider.py``) so the two surfaces
|
||||
stay learnable together.
|
||||
|
||||
Response shape
|
||||
--------------
|
||||
All providers return a dict that :func:`success_response` / :func:`error_response`
|
||||
|
|
@ -21,6 +33,7 @@ produce. The tool wrapper JSON-serializes it. Keys:
|
|||
model str provider-specific model identifier
|
||||
prompt str echoed prompt
|
||||
aspect_ratio str "landscape" | "square" | "portrait"
|
||||
modality str "text" | "image" (which mode was used)
|
||||
provider str provider name (for diagnostics)
|
||||
error str only when success=False
|
||||
error_type str only when success=False
|
||||
|
|
@ -127,19 +140,51 @@ class ImageGenProvider(abc.ABC):
|
|||
return models[0].get("id")
|
||||
return None
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
|
||||
"""Return what this provider supports.
|
||||
|
||||
Returned dict (all keys optional)::
|
||||
|
||||
{
|
||||
"modalities": ["text", "image"], # which inputs the backend accepts
|
||||
"max_reference_images": 9, # cap for reference_image_urls
|
||||
}
|
||||
|
||||
``modalities`` declares whether the active backend/model supports
|
||||
text-to-image (``"text"``), image-to-image / editing (``"image"``),
|
||||
or both. The tool layer surfaces this in the dynamic schema so the
|
||||
model knows when ``image_url`` is honored. Used by ``hermes tools``
|
||||
for the picker too. Default: text-only (backward compatible — a
|
||||
provider that doesn't override this advertises text-to-image only).
|
||||
"""
|
||||
return {
|
||||
"modalities": ["text"],
|
||||
"max_reference_images": 0,
|
||||
}
|
||||
|
||||
@abc.abstractmethod
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate an image.
|
||||
"""Generate an image from a text prompt, or edit/transform a source image.
|
||||
|
||||
Routing: if ``image_url`` (or any ``reference_image_urls``) is
|
||||
provided, the provider should route to its image-to-image / edit
|
||||
endpoint; otherwise text-to-image. ``image_url`` is the primary
|
||||
source image to edit; ``reference_image_urls`` are additional
|
||||
style/composition references (provider clamps to its declared
|
||||
``max_reference_images``).
|
||||
|
||||
Implementations should return the dict from :func:`success_response`
|
||||
or :func:`error_response`. ``kwargs`` may contain forward-compat
|
||||
parameters future versions of the schema will expose — implementations
|
||||
should ignore unknown keys.
|
||||
parameters future versions of the schema will expose —
|
||||
implementations MUST ignore unknown keys (no TypeError).
|
||||
"""
|
||||
|
||||
|
||||
|
|
@ -162,6 +207,26 @@ def resolve_aspect_ratio(value: Optional[str]) -> str:
|
|||
return DEFAULT_ASPECT_RATIO
|
||||
|
||||
|
||||
def normalize_reference_images(value: Any) -> Optional[List[str]]:
|
||||
"""Coerce a reference-image argument into a clean list of URL/path strings.
|
||||
|
||||
Accepts a single string or a list; strips blanks and whitespace. Returns
|
||||
``None`` when nothing usable remains so providers can treat "no refs" as a
|
||||
single sentinel.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, str):
|
||||
value = [value]
|
||||
if not isinstance(value, (list, tuple)):
|
||||
return None
|
||||
out: List[str] = []
|
||||
for item in value:
|
||||
if isinstance(item, str) and item.strip():
|
||||
out.append(item.strip())
|
||||
return out or None
|
||||
|
||||
|
||||
def _images_cache_dir() -> Path:
|
||||
"""Return ``$HERMES_HOME/cache/images/``, creating parents as needed."""
|
||||
from hermes_constants import get_hermes_home
|
||||
|
|
@ -280,13 +345,16 @@ def success_response(
|
|||
prompt: str,
|
||||
aspect_ratio: str,
|
||||
provider: str,
|
||||
modality: str = "text",
|
||||
extra: Optional[Dict[str, Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build a uniform success response dict.
|
||||
|
||||
``image`` may be an HTTP URL or an absolute filesystem path (for b64
|
||||
providers like OpenAI). Callers that need to pass through additional
|
||||
backend-specific fields can supply ``extra``.
|
||||
providers like OpenAI). ``modality`` is ``"text"`` (text-to-image) or
|
||||
``"image"`` (image-to-image / editing) — indicates which endpoint was
|
||||
actually hit, useful for diagnostics. Callers that need to pass through
|
||||
additional backend-specific fields can supply ``extra``.
|
||||
"""
|
||||
payload: Dict[str, Any] = {
|
||||
"success": True,
|
||||
|
|
@ -294,6 +362,7 @@ def success_response(
|
|||
"model": model,
|
||||
"prompt": prompt,
|
||||
"aspect_ratio": aspect_ratio,
|
||||
"modality": modality,
|
||||
"provider": provider,
|
||||
}
|
||||
if extra:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue