mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-20 10:11:58 +00:00
feat(image-gen): add image-to-image / editing to image_generate (#48705)
* feat(image-gen): add image-to-image / editing to image_generate

  Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video.

  - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images.
  - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image.
  - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits.
  - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error.
  - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema.
  - Docs: image-generation feature page, image-gen provider plugin guide, tools reference.

* fix(image-gen): preserve legacy passthrough in fal/krea plugin tests

  Two existing plugin tests asserted pre-image-to-image behavior:

  - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs).
  - krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape.

* fix(image-gen): clearer not-capable message for text-to-image-only models

  When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
This commit is contained in:
parent
cfb55de5ea
commit
c02192ff6a
13 changed files with 1239 additions and 106 deletions
|
|
@ -47,6 +47,7 @@ from agent.image_gen_provider import (
|
|||
DEFAULT_ASPECT_RATIO,
|
||||
ImageGenProvider,
|
||||
error_response,
|
||||
normalize_reference_images,
|
||||
resolve_aspect_ratio,
|
||||
save_b64_image,
|
||||
success_response,
|
||||
|
|
@ -112,10 +113,20 @@ class MyBackendImageGenProvider(ImageGenProvider):
|
|||
],
|
||||
}
|
||||
|
||||
def capabilities(self) -> Dict[str, Any]:
|
||||
# Declare whether this backend supports image-to-image / editing.
|
||||
# The tool layer surfaces this in the dynamic schema so the model
|
||||
# knows when `image_url` is honored. Default (if you omit this) is
|
||||
# text-only: {"modalities": ["text"], "max_reference_images": 0}.
|
||||
return {"modalities": ["text", "image"], "max_reference_images": 4}
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
||||
*,
|
||||
image_url: Optional[str] = None,
|
||||
reference_image_urls: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
prompt = (prompt or "").strip()
|
||||
|
|
@ -130,6 +141,15 @@ class MyBackendImageGenProvider(ImageGenProvider):
|
|||
aspect_ratio=aspect_ratio,
|
||||
)
|
||||
|
||||
# Routing: if image_url (or reference_image_urls) is set, the call is
|
||||
# an image-to-image / edit request; otherwise text-to-image. Report
|
||||
# which path you took via the `modality` field of success_response.
|
||||
sources = []
|
||||
if image_url:
|
||||
sources.append(image_url)
|
||||
sources.extend(normalize_reference_images(reference_image_urls) or [])
|
||||
modality = "image" if sources else "text"
|
||||
|
||||
# Model selection precedence: env var → config → default. The helper
|
||||
# _resolve_model() in the built-in openai plugin is a good reference.
|
||||
model_id = kwargs.get("model") or self.default_model() or "my-model-fast"
|
||||
|
|
@ -137,11 +157,18 @@ class MyBackendImageGenProvider(ImageGenProvider):
|
|||
try:
|
||||
import my_backend_sdk
|
||||
client = my_backend_sdk.Client(api_key=os.environ["MY_BACKEND_API_KEY"])
|
||||
result = client.generate(
|
||||
prompt=prompt,
|
||||
model=model_id,
|
||||
aspect_ratio=aspect_ratio,
|
||||
)
|
||||
if modality == "image":
|
||||
result = client.edit(
|
||||
prompt=prompt,
|
||||
model=model_id,
|
||||
image_urls=sources,
|
||||
)
|
||||
else:
|
||||
result = client.generate(
|
||||
prompt=prompt,
|
||||
model=model_id,
|
||||
aspect_ratio=aspect_ratio,
|
||||
)
|
||||
|
||||
# Two shapes supported:
|
||||
# - URL string: return it as `image`
|
||||
|
|
@ -162,6 +189,7 @@ class MyBackendImageGenProvider(ImageGenProvider):
|
|||
prompt=prompt,
|
||||
aspect_ratio=aspect_ratio,
|
||||
provider=self.name,
|
||||
modality=modality,
|
||||
)
|
||||
except Exception as exc:
|
||||
return error_response(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue