diff --git a/agent/image_gen_provider.py b/agent/image_gen_provider.py index a7f1b8c31ff..a3eeb1e4c8c 100644 --- a/agent/image_gen_provider.py +++ b/agent/image_gen_provider.py @@ -11,6 +11,18 @@ Providers live in ``/plugins/image_gen//`` (built-in, auto-loaded as ``kind: backend``) or ``~/.hermes/plugins/image_gen//`` (user, opt-in via ``plugins.enabled``). +Unified surface +--------------- +One tool — ``image_generate`` — covers **text-to-image** and +**image-to-image / image editing**. The router is the presence of +``image_url`` (and/or ``reference_image_urls``): if any source image is +provided, the provider routes to its image-to-image / edit endpoint; if +omitted, the provider routes to text-to-image. Users pick one **model** +(e.g. nano-banana-pro, gpt-image-2, grok-imagine-image); the provider +handles which underlying endpoint to hit. This mirrors the ``video_gen`` +provider design (``agent/video_gen_provider.py``) so the two surfaces +stay learnable together. + Response shape -------------- All providers return a dict that :func:`success_response` / :func:`error_response` @@ -21,6 +33,7 @@ produce. The tool wrapper JSON-serializes it. Keys: model str provider-specific model identifier prompt str echoed prompt aspect_ratio str "landscape" | "square" | "portrait" + modality str "text" | "image" (which mode was used) provider str provider name (for diagnostics) error str only when success=False error_type str only when success=False @@ -127,19 +140,51 @@ class ImageGenProvider(abc.ABC): return models[0].get("id") return None + def capabilities(self) -> Dict[str, Any]: + """Return what this provider supports. + + Returned dict (all keys optional):: + + { + "modalities": ["text", "image"], # which inputs the backend accepts + "max_reference_images": 9, # cap for reference_image_urls + } + + ``modalities`` declares whether the active backend/model supports + text-to-image (``"text"``), image-to-image / editing (``"image"``), + or both. 
def normalize_reference_images(value: Any) -> Optional[List[str]]:
    """Coerce a reference-image argument into a clean list of URL/path strings.

    Accepted inputs:

    * ``None`` -> ``None``
    * a single string or ``os.PathLike`` object -> one-element list
    * a ``list`` / ``tuple`` whose items are strings or ``os.PathLike``
      objects; any other item type is skipped

    Every entry is whitespace-stripped and blank entries are discarded.
    Order is preserved and duplicates are kept.

    Returns:
        The cleaned list, or ``None`` when nothing usable remains, so
        providers can treat "no refs" as a single sentinel.
    """
    # Function-scope import keeps this helper self-contained regardless of
    # what the module imports at the top.
    import os

    if value is None:
        return None
    if isinstance(value, (str, os.PathLike)):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return None

    cleaned: List[str] = []
    for item in value:
        if isinstance(item, os.PathLike):
            # pathlib.Path and friends: take the filesystem representation.
            # A bytes-based path yields bytes here and is skipped below.
            item = os.fspath(item)
        if isinstance(item, str):
            text = item.strip()
            if text:
                cleaned.append(text)
    return cleaned or None
def capabilities(self) -> Dict[str, Any]:
    """Report the input modalities of the currently selected FAL model.

    The active model is resolved through
    ``tools.image_generation_tool._resolve_fal_model()``. A model whose
    metadata carries a truthy ``edit_endpoint`` advertises both ``"text"``
    and ``"image"``, with ``max_reference_images`` taken from the metadata
    (at least 1). Every other outcome reports text-to-image only.

    This is a best-effort probe: an import failure, a resolution failure,
    or malformed metadata degrades to the text-only answer instead of
    raising.
    """
    text_only: Dict[str, Any] = {"modalities": ["text"], "max_reference_images": 0}

    try:
        # Imported lazily, and inside the guard so an import problem is
        # handled the same way as a resolution problem.
        import tools.image_generation_tool as _it

        _model_id, meta = _it._resolve_fal_model()
    except Exception:  # noqa: BLE001
        return text_only

    if not isinstance(meta, dict) or not meta.get("edit_endpoint"):
        return text_only

    try:
        max_refs = int(meta.get("max_reference_images") or 1)
    except (TypeError, ValueError):
        # Non-numeric metadata: fall back to a single source image.
        max_refs = 1

    return {
        "modalities": ["text", "image"],
        # An edit-capable model accepts at least the primary source image.
        "max_reference_images": max(1, max_refs),
    }
""" import tools.image_generation_tool as _it @@ -124,6 +146,13 @@ class FalImageGenProvider(ImageGenProvider): ) if key in kwargs and kwargs[key] is not None } + # Only forward the image-to-image inputs when actually supplied, so a + # plain text-to-image call delegates exactly as it did before (no + # noisy None kwargs). + if image_url is not None: + passthrough["image_url"] = image_url + if reference_image_urls is not None: + passthrough["reference_image_urls"] = reference_image_urls try: raw = _it.image_generate_tool( diff --git a/plugins/image_gen/krea/__init__.py b/plugins/image_gen/krea/__init__.py index 552f2ae71fe..a897302175b 100644 --- a/plugins/image_gen/krea/__init__.py +++ b/plugins/image_gen/krea/__init__.py @@ -33,6 +33,7 @@ from agent.image_gen_provider import ( DEFAULT_ASPECT_RATIO, ImageGenProvider, error_response, + normalize_reference_images, resolve_aspect_ratio, save_url_image, success_response, @@ -191,7 +192,7 @@ class KreaImageGenProvider(ImageGenProvider): return { "name": "Krea", "badge": "paid", - "tag": "Krea 2 foundation model — Medium ($0.03) + Large ($0.06). Strong style transfer + moodboards.", + "tag": "Krea 2 foundation model — Medium ($0.03) + Large ($0.06). Style transfer, moodboards, reference-guided generation.", "env_vars": [ { "key": "KREA_API_KEY", @@ -201,6 +202,11 @@ class KreaImageGenProvider(ImageGenProvider): ], } + def capabilities(self) -> Dict[str, Any]: + # Krea supports reference-guided generation (image-to-image style + # transfer) via image_style_references — up to 10 refs. 
+ return {"modalities": ["text", "image"], "max_reference_images": 10} + # ------------------------------------------------------------------ # generate() # ------------------------------------------------------------------ @@ -209,12 +215,48 @@ class KreaImageGenProvider(ImageGenProvider): self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, + *, + image_url: Optional[str] = None, + reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: prompt = (prompt or "").strip() aspect = resolve_aspect_ratio(aspect_ratio) krea_ar = _ASPECT_MAP.get(aspect, "1:1") + # Collect reference images for reference-guided generation (image-to- + # image style transfer). Sources, in order: + # 1. unified image_url (primary source) + reference_image_urls (strings) + # 2. legacy image_style_references kwarg — may be plain URL strings OR + # Krea's richer ref objects (e.g. {"url": ..., "strength": ...}), + # which are passed through verbatim for backward compatibility. + style_refs: List[Any] = [] + if isinstance(image_url, str) and image_url.strip(): + style_refs.append(image_url.strip()) + for ref in (normalize_reference_images(reference_image_urls) or []): + style_refs.append(ref) + legacy_refs = kwargs.get("image_style_references") + if isinstance(legacy_refs, list): + for ref in legacy_refs: + if isinstance(ref, str): + if ref.strip(): + style_refs.append(ref.strip()) + elif ref: + # Non-string ref object (dict, etc.) — pass through as-is. + style_refs.append(ref) + # Dedupe string entries while preserving order (dict refs aren't + # hashable, so they're kept verbatim); Krea caps at 10. 
+ seen: set = set() + deduped: List[Any] = [] + for r in style_refs: + if isinstance(r, str): + if r in seen: + continue + seen.add(r) + deduped.append(r) + style_refs = deduped[:10] + modality = "image" if style_refs else "text" + if not prompt: return error_response( error="Prompt is required and must be a non-empty string", @@ -256,10 +298,10 @@ class KreaImageGenProvider(ImageGenProvider): if isinstance(styles, list) and styles: payload["styles"] = styles - image_style_references = kwargs.get("image_style_references") - if isinstance(image_style_references, list) and image_style_references: - # Krea caps at 10 refs per request. - payload["image_style_references"] = image_style_references[:10] + if style_refs: + # Reference-guided generation (image-to-image style transfer). + # Krea caps at 10 refs per request (already clamped above). + payload["image_style_references"] = style_refs moodboards = kwargs.get("moodboards") if isinstance(moodboards, list) and moodboards: @@ -483,19 +525,19 @@ class KreaImageGenProvider(ImageGenProvider): # Per Krea's job-lifecycle docs the completed payload exposes # ``result.urls`` (an array). Fall back to a single ``url`` field # for forward/backward compatibility. 
- image_url: Optional[str] = None + result_image_url: Optional[str] = None urls = result.get("urls") if isinstance(urls, list) and urls: for candidate in urls: if isinstance(candidate, str) and candidate.strip(): - image_url = candidate.strip() + result_image_url = candidate.strip() break - if image_url is None: + if result_image_url is None: single = result.get("url") if isinstance(single, str) and single.strip(): - image_url = single.strip() + result_image_url = single.strip() - if image_url is None: + if result_image_url is None: return error_response( error="Krea result contained no image URL", error_type="empty_response", @@ -508,14 +550,14 @@ class KreaImageGenProvider(ImageGenProvider): # Materialise locally — Krea result URLs may expire, mirroring # what we do for xAI / OpenAI URL responses (#26942). try: - saved_path = save_url_image(image_url, prefix=f"krea_{model_id}") + saved_path = save_url_image(result_image_url, prefix=f"krea_{model_id}") except Exception as exc: # noqa: BLE001 logger.warning( "Krea image URL %s could not be cached (%s); falling back to bare URL.", - image_url, + result_image_url, exc, ) - image_ref = image_url + image_ref = result_image_url else: image_ref = str(saved_path) @@ -534,6 +576,7 @@ class KreaImageGenProvider(ImageGenProvider): prompt=prompt, aspect_ratio=aspect, provider="krea", + modality=modality, extra=extra, ) diff --git a/plugins/image_gen/openai-codex/__init__.py b/plugins/image_gen/openai-codex/__init__.py index 6fde2d60bbb..0bd61267db1 100644 --- a/plugins/image_gen/openai-codex/__init__.py +++ b/plugins/image_gen/openai-codex/__init__.py @@ -319,7 +319,7 @@ class OpenAICodexImageGenProvider(ImageGenProvider): return { "name": "OpenAI (Codex auth)", "badge": "free", - "tag": "gpt-image-2 via ChatGPT/Codex OAuth — no API key required", + "tag": "gpt-image-2 via ChatGPT/Codex OAuth — no API key required (text-to-image only)", "env_vars": [], "post_setup_hint": ( "Sign in with `hermes auth codex` (or `hermes setup` 
→ Codex) " @@ -327,15 +327,41 @@ class OpenAICodexImageGenProvider(ImageGenProvider): ), } + def capabilities(self) -> Dict[str, Any]: + # The Codex Responses image_generation tool path is text-to-image + # only here. Image-to-image / editing via Codex OAuth is not wired — + # users who need editing should use the `openai` (API key), `fal`, or + # `xai` backends. Declaring text-only keeps the dynamic tool schema + # honest so the model doesn't attempt an unsupported edit. + return {"modalities": ["text"], "max_reference_images": 0} + def generate( self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, + *, + image_url: Optional[str] = None, + reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: prompt = (prompt or "").strip() aspect = resolve_aspect_ratio(aspect_ratio) + # Image-to-image / editing is not supported on the Codex OAuth path. + # Surface a clear, actionable error instead of silently ignoring the + # source image and producing an unrelated picture. + if (isinstance(image_url, str) and image_url.strip()) or reference_image_urls: + return error_response( + error=( + "This model is not capable of image-to-image / editing. " + "Please provide a text-only prompt (drop image_url and " + "reference_image_urls)." 
+ ), + error_type="modality_unsupported", + provider="openai-codex", + aspect_ratio=aspect, + ) + if not prompt: return error_response( error="Prompt is required and must be a non-empty string", diff --git a/plugins/image_gen/openai/__init__.py b/plugins/image_gen/openai/__init__.py index 448f5bc45af..e214271bcd9 100644 --- a/plugins/image_gen/openai/__init__.py +++ b/plugins/image_gen/openai/__init__.py @@ -31,6 +31,7 @@ from agent.image_gen_provider import ( DEFAULT_ASPECT_RATIO, ImageGenProvider, error_response, + normalize_reference_images, resolve_aspect_ratio, save_b64_image, save_url_image, @@ -117,13 +118,48 @@ def _resolve_model() -> Tuple[str, Dict[str, Any]]: return DEFAULT_MODEL, _MODELS[DEFAULT_MODEL] +# --------------------------------------------------------------------------- +# Source-image loading (for image-to-image / edit) +# --------------------------------------------------------------------------- + + +def _load_image_bytes(ref: str) -> Tuple[bytes, str]: + """Load image bytes from a URL or local file path. + + Returns ``(data, filename)``. Raises on any network / IO error so the + caller can surface a clean error_response. + """ + ref = ref.strip() + lower = ref.lower() + if lower.startswith(("http://", "https://")): + import requests + + resp = requests.get(ref, timeout=60) + resp.raise_for_status() + name = ref.split("?", 1)[0].rsplit("/", 1)[-1] or "image.png" + return resp.content, name + if lower.startswith("data:"): + import base64 + + header, _, b64 = ref.partition(",") + ext = "png" + if "image/" in header: + ext = header.split("image/", 1)[1].split(";", 1)[0] or "png" + return base64.b64decode(b64), f"image.{ext}" + # Local file path. 
+ with open(ref, "rb") as fh: + data = fh.read() + name = os.path.basename(ref) or "image.png" + return data, name + + # --------------------------------------------------------------------------- # Provider # --------------------------------------------------------------------------- class OpenAIImageGenProvider(ImageGenProvider): - """OpenAI ``images.generate`` backend — gpt-image-2 at low/medium/high.""" + """OpenAI ``images.generate`` / ``images.edit`` backend — gpt-image-2.""" @property def name(self) -> str: @@ -161,7 +197,7 @@ class OpenAIImageGenProvider(ImageGenProvider): return { "name": "OpenAI", "badge": "paid", - "tag": "gpt-image-2 at low/medium/high quality tiers", + "tag": "gpt-image-2 at low/medium/high quality tiers — text-to-image & image editing", "env_vars": [ { "key": "OPENAI_API_KEY", @@ -171,10 +207,18 @@ class OpenAIImageGenProvider(ImageGenProvider): ], } + def capabilities(self) -> Dict[str, Any]: + # gpt-image-2 supports editing via images.edit() with up to 16 source + # images. + return {"modalities": ["text", "image"], "max_reference_images": 16} + def generate( self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, + *, + image_url: Optional[str] = None, + reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: prompt = (prompt or "").strip() @@ -213,29 +257,82 @@ class OpenAIImageGenProvider(ImageGenProvider): tier_id, meta = _resolve_model() size = _SIZES.get(aspect, _SIZES["square"]) - # gpt-image-2 returns b64_json unconditionally and REJECTS - # ``response_format`` as an unknown parameter. Don't send it. - payload: Dict[str, Any] = { - "model": API_MODEL, - "prompt": prompt, - "size": size, - "n": 1, - "quality": meta["quality"], - } + # Collect source images (primary + references) for image-to-image. 
+ sources: List[str] = [] + if isinstance(image_url, str) and image_url.strip(): + sources.append(image_url.strip()) + for ref in (normalize_reference_images(reference_image_urls) or []): + sources.append(ref) + sources = sources[:16] # gpt-image-2 edit caps at 16 images + is_edit = bool(sources) + modality = "image" if is_edit else "text" - try: - client = openai.OpenAI() - response = client.images.generate(**payload) - except Exception as exc: - logger.debug("OpenAI image generation failed", exc_info=True) - return error_response( - error=f"OpenAI image generation failed: {exc}", - error_type="api_error", - provider="openai", - model=tier_id, - prompt=prompt, - aspect_ratio=aspect, - ) + client = openai.OpenAI() + + if is_edit: + # images.edit() expects file-like objects. Download/read each + # source into a named BytesIO so the SDK sends correct multipart. + import io + + try: + files = [] + for ref in sources: + data, fname = _load_image_bytes(ref) + bio = io.BytesIO(data) + bio.name = fname + files.append(bio) + except Exception as exc: + return error_response( + error=f"Could not load source image for editing: {exc}", + error_type="io_error", + provider="openai", + model=tier_id, + prompt=prompt, + aspect_ratio=aspect, + ) + + try: + response = client.images.edit( + model=API_MODEL, + image=files if len(files) > 1 else files[0], + prompt=prompt, + size=size, # type: ignore[arg-type] # _SIZES values are valid gpt-image sizes + quality=meta["quality"], + n=1, + ) + except Exception as exc: + logger.debug("OpenAI image edit failed", exc_info=True) + return error_response( + error=f"OpenAI image editing failed: {exc}", + error_type="api_error", + provider="openai", + model=tier_id, + prompt=prompt, + aspect_ratio=aspect, + ) + else: + # gpt-image-2 returns b64_json unconditionally and REJECTS + # ``response_format`` as an unknown parameter. Don't send it. 
+ payload: Dict[str, Any] = { + "model": API_MODEL, + "prompt": prompt, + "size": size, + "n": 1, + "quality": meta["quality"], + } + + try: + response = client.images.generate(**payload) + except Exception as exc: + logger.debug("OpenAI image generation failed", exc_info=True) + return error_response( + error=f"OpenAI image generation failed: {exc}", + error_type="api_error", + provider="openai", + model=tier_id, + prompt=prompt, + aspect_ratio=aspect, + ) data = getattr(response, "data", None) or [] if not data: @@ -302,6 +399,7 @@ class OpenAIImageGenProvider(ImageGenProvider): prompt=prompt, aspect_ratio=aspect, provider="openai", + modality=modality, extra=extra, ) diff --git a/plugins/image_gen/xai/__init__.py b/plugins/image_gen/xai/__init__.py index a8982393f7e..f487d90ada6 100644 --- a/plugins/image_gen/xai/__init__.py +++ b/plugins/image_gen/xai/__init__.py @@ -27,6 +27,7 @@ from agent.image_gen_provider import ( DEFAULT_ASPECT_RATIO, ImageGenProvider, error_response, + normalize_reference_images, resolve_aspect_ratio, save_b64_image, save_url_image, @@ -114,6 +115,31 @@ def _resolve_resolution() -> str: return DEFAULT_RESOLUTION +def _xai_image_field(source: str) -> Dict[str, str]: + """Build the xAI ``image`` field for an edit request. + + xAI's ``/v1/images/edits`` accepts ``{"url": , "type": "image_url"}`` + where ```` is a public URL or a base64 data URI. Public URLs and + existing data URIs pass through unchanged; local file paths are read and + encoded into a ``data:`` URI. + """ + source = source.strip() + lower = source.lower() + if lower.startswith(("http://", "https://", "data:")): + return {"url": source, "type": "image_url"} + # Local file path → base64 data URI. 
+ import base64 + import os as _os + + with open(source, "rb") as fh: + raw = fh.read() + ext = (_os.path.splitext(source)[1].lstrip(".") or "png").lower() + if ext == "jpg": + ext = "jpeg" + b64 = base64.b64encode(raw).decode("utf-8") + return {"url": f"data:image/{ext};base64,{b64}", "type": "image_url"} + + # --------------------------------------------------------------------------- # Provider # --------------------------------------------------------------------------- @@ -153,18 +179,34 @@ class XAIImageGenProvider(ImageGenProvider): return { "name": "xAI Grok Imagine (image)", "badge": "paid", - "tag": "grok-imagine-image — text-to-image; uses xAI Grok OAuth or XAI_API_KEY", + "tag": "grok-imagine-image — text-to-image & image editing; uses xAI Grok OAuth or XAI_API_KEY", "env_vars": [], "post_setup": "xai_grok", } + def capabilities(self) -> Dict[str, Any]: + # xAI's /v1/images/edits supports image editing via grok-imagine-image + # -quality. Single primary source image (multi-image editing exists as + # a separate capability but we keep the primary edit surface here). + return {"modalities": ["text", "image"], "max_reference_images": 1} + def generate( self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, + *, + image_url: Optional[str] = None, + reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: - """Generate an image using xAI's grok-imagine-image.""" + """Generate an image (text-to-image) or edit a source image (image-to-image). + + Routing: when ``image_url`` is provided, POST to ``/v1/images/edits`` + with the source image; otherwise POST to ``/v1/images/generations``. + Per xAI docs, editing uses the ``grok-imagine-image-quality`` model and + a JSON body (the OpenAI SDK's multipart ``images.edit()`` is NOT + supported by xAI). 
+ """ creds = resolve_xai_http_credentials() api_key = str(creds.get("api_key") or "").strip() provider_name = str(creds.get("provider") or "xai").strip() or "xai" @@ -182,12 +224,17 @@ class XAIImageGenProvider(ImageGenProvider): resolution = _resolve_resolution() xai_res = resolution if resolution in _XAI_RESOLUTIONS else DEFAULT_RESOLUTION - payload: Dict[str, Any] = { - "model": model_id, - "prompt": prompt, - "aspect_ratio": xai_ar, - "resolution": xai_res, - } + # Pick the primary source image: explicit image_url wins, else the + # first reference image. + source_image = None + if isinstance(image_url, str) and image_url.strip(): + source_image = image_url.strip() + else: + refs = normalize_reference_images(reference_image_urls) + if refs: + source_image = refs[0] + is_edit = bool(source_image) + modality = "image" if is_edit else "text" headers = { "Authorization": f"Bearer {api_key}", @@ -197,9 +244,41 @@ class XAIImageGenProvider(ImageGenProvider): base_url = str(creds.get("base_url") or "https://api.x.ai/v1").strip().rstrip("/") + if is_edit: + # Editing requires the quality model per xAI docs. The source + # image may be a public URL or a base64 data URI; local file paths + # are converted to a data URI here. 
+ edit_model = "grok-imagine-image-quality" + try: + image_field = _xai_image_field(source_image) + except Exception as exc: + return error_response( + error=f"Could not load source image for editing: {exc}", + error_type="io_error", + provider=provider_name, + model=edit_model, + prompt=prompt, + aspect_ratio=aspect, + ) + payload: Dict[str, Any] = { + "model": edit_model, + "prompt": prompt, + "image": image_field, + } + endpoint_url = f"{base_url}/images/edits" + model_id = edit_model + else: + payload = { + "model": model_id, + "prompt": prompt, + "aspect_ratio": xai_ar, + "resolution": xai_res, + } + endpoint_url = f"{base_url}/images/generations" + try: response = requests.post( - f"{base_url}/images/generations", + endpoint_url, headers=headers, json=payload, timeout=120, @@ -310,9 +389,9 @@ class XAIImageGenProvider(ImageGenProvider): aspect_ratio=aspect, ) - extra: Dict[str, Any] = { - "resolution": xai_res, - } + extra: Dict[str, Any] = {} + if not is_edit: + extra["resolution"] = xai_res return success_response( image=image_ref, @@ -320,6 +399,7 @@ class XAIImageGenProvider(ImageGenProvider): prompt=prompt, aspect_ratio=aspect, provider="xai", + modality=modality, extra=extra, ) diff --git a/tests/tools/test_image_generation.py b/tests/tools/test_image_generation.py index b24e6bc1fcc..df7d3a34abb 100644 --- a/tests/tools/test_image_generation.py +++ b/tests/tools/test_image_generation.py @@ -363,11 +363,16 @@ class TestAspectRatioNormalization: class TestRegistryIntegration: - def test_schema_exposes_only_prompt_and_aspect_ratio_to_agent(self, image_tool): - """The agent-facing schema must stay tight — model selection is a - user-level config choice, not an agent-level arg.""" + def test_schema_exposes_expected_agent_params(self, image_tool): + """The agent-facing schema exposes the unified text+image surface: + prompt (required), aspect_ratio, and the image-to-image inputs + image_url + reference_image_urls. 
Model selection stays a user-level + config choice, never an agent-level arg.""" props = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"] - assert set(props.keys()) == {"prompt", "aspect_ratio"} + assert set(props.keys()) == { + "prompt", "aspect_ratio", "image_url", "reference_image_urls", + } + assert image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["required"] == ["prompt"] def test_aspect_ratio_enum_is_three_values(self, image_tool): enum = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"]["aspect_ratio"]["enum"] diff --git a/tests/tools/test_image_generation_artifacts.py b/tests/tools/test_image_generation_artifacts.py index 2a1ce111353..ea4fd37d01c 100644 --- a/tests/tools/test_image_generation_artifacts.py +++ b/tests/tools/test_image_generation_artifacts.py @@ -110,7 +110,7 @@ def test_handle_image_generate_postprocesses_plugin_result(monkeypatch, tmp_path monkeypatch.setattr( image_generation_tool, "_dispatch_to_plugin_provider", - lambda prompt, aspect_ratio: json.dumps({"success": True, "image": str(image_path)}), + lambda prompt, aspect_ratio, **kw: json.dumps({"success": True, "image": str(image_path)}), ) result = json.loads( diff --git a/tests/tools/test_image_generation_image_to_image.py b/tests/tools/test_image_generation_image_to_image.py new file mode 100644 index 00000000000..4e9d457a49f --- /dev/null +++ b/tests/tools/test_image_generation_image_to_image.py @@ -0,0 +1,349 @@ +"""Tests for the image-to-image / editing surface of ``image_generate``. + +Mirrors the video-gen image-to-video tests: the unified ``image_generate`` +tool routes to a provider's edit endpoint when ``image_url`` / +``reference_image_urls`` is supplied, otherwise to text-to-image. 
Coverage: + +- In-tree FAL edit payload construction (``_build_fal_edit_payload``) +- In-tree FAL routing (text vs edit endpoint) via ``image_generate_tool`` +- Plugin dispatch forwards image_url / reference_image_urls to ``generate()`` +- ``capabilities()`` honesty drives the dynamic tool-schema description +- Models without an edit endpoint reject image inputs with a clear error +""" + +from __future__ import annotations + +import json +from typing import Any, Dict, List, Optional + +import pytest +import yaml + +from agent import image_gen_registry +from agent.image_gen_provider import ImageGenProvider + + +@pytest.fixture(autouse=True) +def _reset_registry(): + image_gen_registry._reset_for_tests() + yield + image_gen_registry._reset_for_tests() + + +@pytest.fixture +def cfg_home(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + return tmp_path + + +def _write_cfg(home, cfg: dict): + (home / "config.yaml").write_text(yaml.safe_dump(cfg)) + + +# --------------------------------------------------------------------------- +# In-tree FAL edit payload + routing +# --------------------------------------------------------------------------- + + +class TestFalEditPayload: + def test_edit_payload_includes_image_urls(self): + from tools.image_generation_tool import _build_fal_edit_payload + + payload = _build_fal_edit_payload( + "fal-ai/nano-banana-pro", "make it night", ["https://x/y.png"], + "landscape", + ) + assert payload["prompt"] == "make it night" + assert payload["image_urls"] == ["https://x/y.png"] + # nano-banana edit advertises aspect_ratio in edit_supports + assert payload.get("aspect_ratio") == "16:9" + + def test_edit_payload_strips_keys_outside_edit_supports(self): + from tools.image_generation_tool import _build_fal_edit_payload + + # gpt-image-2 edit does NOT advertise image_size (auto-inferred), so + # it must be stripped even though the text-to-image path sets it. 
+ payload = _build_fal_edit_payload( + "fal-ai/gpt-image-2", "swap bg", ["https://x/y.png"], "square", + ) + assert "image_size" not in payload + assert payload["image_urls"] == ["https://x/y.png"] + assert payload["quality"] == "medium" + + def test_text_only_model_has_no_edit_endpoint(self): + from tools.image_generation_tool import FAL_MODELS + + # z-image/turbo is a pure text-to-image model — no edit endpoint. + assert "edit_endpoint" not in FAL_MODELS["fal-ai/z-image/turbo"] + # while nano-banana-pro is edit-capable + assert FAL_MODELS["fal-ai/nano-banana-pro"].get("edit_endpoint") + + +class TestFalRouting: + def _patch_submit(self, monkeypatch, image_tool, capture: dict): + class _Handler: + def get(self_inner): + return {"images": [{"url": "https://out/img.png", "width": 1, "height": 1}]} + + def fake_submit(endpoint, arguments): + capture["endpoint"] = endpoint + capture["arguments"] = arguments + return _Handler() + + monkeypatch.setattr(image_tool, "_submit_fal_request", fake_submit) + monkeypatch.setattr(image_tool, "fal_key_is_configured", lambda: True) + monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", lambda: None) + + def test_text_to_image_uses_base_endpoint(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/nano-banana-pro"}}) + capture: dict = {} + self._patch_submit(monkeypatch, image_tool, capture) + + raw = image_tool.image_generate_tool(prompt="a cat", aspect_ratio="square") + out = json.loads(raw) + assert out["success"] is True + assert out["modality"] == "text" + assert capture["endpoint"] == "fal-ai/nano-banana-pro" + assert "image_urls" not in capture["arguments"] + + def test_image_to_image_routes_to_edit_endpoint(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/nano-banana-pro"}}) + capture: dict = {} + self._patch_submit(monkeypatch, image_tool, 
capture) + + raw = image_tool.image_generate_tool( + prompt="make it night", + aspect_ratio="square", + image_url="https://in/src.png", + ) + out = json.loads(raw) + assert out["success"] is True + assert out["modality"] == "image" + assert capture["endpoint"] == "fal-ai/nano-banana-pro/edit" + assert capture["arguments"]["image_urls"] == ["https://in/src.png"] + + def test_reference_images_clamped_to_model_cap(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + + # nano-banana-pro caps at 2 source images total (primary + references). + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/nano-banana-pro"}}) + capture: dict = {} + self._patch_submit(monkeypatch, image_tool, capture) + + raw = image_tool.image_generate_tool( + prompt="blend", + image_url="https://in/a.png", + reference_image_urls=["https://in/b.png", "https://in/c.png", "https://in/d.png"], + ) + out = json.loads(raw) + assert out["success"] is True + assert capture["arguments"]["image_urls"] == ["https://in/a.png", "https://in/b.png"] + + def test_text_only_model_rejects_image_url(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/z-image/turbo"}}) + capture: dict = {} + self._patch_submit(monkeypatch, image_tool, capture) + + raw = image_tool.image_generate_tool( + prompt="edit this", image_url="https://in/src.png", + ) + out = json.loads(raw) + assert out["success"] is False + assert "image-to-image" in out["error"] + # Must NOT have submitted anything. + assert capture == {} + + def test_edit_skips_upscaler(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + + # flux-2-pro has upscale=True for text-to-image, but edits must skip it. 
+ _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/flux-2-pro"}}) + capture: dict = {} + self._patch_submit(monkeypatch, image_tool, capture) + upscale_called = {"hit": False} + monkeypatch.setattr( + image_tool, "_upscale_image", + lambda *a, **k: upscale_called.__setitem__("hit", True) or None, + ) + + raw = image_tool.image_generate_tool( + prompt="tweak", image_url="https://in/src.png", + ) + out = json.loads(raw) + assert out["success"] is True + assert out["modality"] == "image" + assert upscale_called["hit"] is False + + +# --------------------------------------------------------------------------- +# Plugin dispatch forwarding +# --------------------------------------------------------------------------- + + +class _EditCapableProvider(ImageGenProvider): + def __init__(self): + self.received: Dict[str, Any] = {} + + @property + def name(self) -> str: + return "editcap" + + def capabilities(self) -> Dict[str, Any]: + return {"modalities": ["text", "image"], "max_reference_images": 4} + + def generate(self, prompt, aspect_ratio="landscape", *, image_url=None, + reference_image_urls=None, **kwargs): + self.received = { + "prompt": prompt, + "aspect_ratio": aspect_ratio, + "image_url": image_url, + "reference_image_urls": reference_image_urls, + } + return { + "success": True, "image": "/tmp/out.png", "model": "editcap-1", + "prompt": prompt, "aspect_ratio": aspect_ratio, + "modality": "image" if image_url else "text", "provider": "editcap", + } + + +class _LegacyProvider(ImageGenProvider): + """Provider whose generate() predates image_url (no **kwargs absorb).""" + + @property + def name(self) -> str: + return "legacy" + + def generate(self, prompt, aspect_ratio="landscape"): # narrow signature + return {"success": True, "image": "/tmp/legacy.png", "provider": "legacy"} + + +class TestPluginDispatchImageToImage: + def test_dispatch_forwards_image_url(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + from hermes_cli import 
plugins as plugins_module + from agent import image_gen_registry as reg + + provider = _EditCapableProvider() + reg.register_provider(provider) + monkeypatch.setattr(image_tool, "_read_configured_image_provider", lambda: "editcap") + monkeypatch.setattr(plugins_module, "_ensure_plugins_discovered", lambda *a, **k: None) + monkeypatch.setattr(reg, "get_provider", lambda n: provider if n == "editcap" else None) + + raw = image_tool._dispatch_to_plugin_provider( + "make night", "square", + image_url="https://in/src.png", + reference_image_urls=["https://in/ref.png"], + ) + out = json.loads(raw) + assert out["success"] is True + assert out["modality"] == "image" + assert provider.received["image_url"] == "https://in/src.png" + assert provider.received["reference_image_urls"] == ["https://in/ref.png"] + + def test_dispatch_text_only_when_no_image(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + from hermes_cli import plugins as plugins_module + from agent import image_gen_registry as reg + + provider = _EditCapableProvider() + reg.register_provider(provider) + monkeypatch.setattr(image_tool, "_read_configured_image_provider", lambda: "editcap") + monkeypatch.setattr(plugins_module, "_ensure_plugins_discovered", lambda *a, **k: None) + monkeypatch.setattr(reg, "get_provider", lambda n: provider if n == "editcap" else None) + + raw = image_tool._dispatch_to_plugin_provider("a dog", "landscape") + out = json.loads(raw) + assert out["success"] is True + assert provider.received["image_url"] is None + assert "reference_image_urls" not in provider.received or provider.received["reference_image_urls"] is None + + def test_legacy_provider_edit_request_surfaces_clear_error(self, cfg_home, monkeypatch): + import tools.image_generation_tool as image_tool + from hermes_cli import plugins as plugins_module + from agent import image_gen_registry as reg + + provider = _LegacyProvider() + reg.register_provider(provider) + 
monkeypatch.setattr(image_tool, "_read_configured_image_provider", lambda: "legacy") + monkeypatch.setattr(plugins_module, "_ensure_plugins_discovered", lambda *a, **k: None) + monkeypatch.setattr(reg, "get_provider", lambda n: provider if n == "legacy" else None) + + raw = image_tool._dispatch_to_plugin_provider( + "edit it", "square", image_url="https://in/src.png", + ) + out = json.loads(raw) + assert out["success"] is False + assert out["error_type"] == "modality_unsupported" + + +# --------------------------------------------------------------------------- +# Dynamic schema reflects active capabilities +# --------------------------------------------------------------------------- + + +class _PluginBothProvider(ImageGenProvider): + @property + def name(self) -> str: + return "both" + + def is_available(self) -> bool: + return True + + def default_model(self) -> Optional[str]: + return "both-v1" + + def capabilities(self) -> Dict[str, Any]: + return {"modalities": ["text", "image"], "max_reference_images": 5} + + def generate(self, prompt, aspect_ratio="landscape", *, image_url=None, + reference_image_urls=None, **kwargs): + return {"success": True} + + +class TestDynamicSchema: + def _no_discovery(self, monkeypatch): + import hermes_cli.plugins as plugins_module + monkeypatch.setattr(plugins_module, "_ensure_plugins_discovered", lambda *a, **k: None) + + def test_fal_edit_model_advertises_both(self, cfg_home, monkeypatch): + from tools.image_generation_tool import _build_dynamic_image_schema + + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/nano-banana-pro"}}) + desc = _build_dynamic_image_schema()["description"] + assert "text-to-image" in desc and "image-to-image" in desc + assert "routes automatically" in desc + + def test_fal_text_only_model_warns(self, cfg_home, monkeypatch): + from tools.image_generation_tool import _build_dynamic_image_schema + + _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/z-image/turbo"}}) + desc = 
_build_dynamic_image_schema()["description"] + assert "text-to-image only" in desc + assert "NOT capable of image-to-image" in desc + + def test_plugin_both_provider_advertises_refs(self, cfg_home, monkeypatch): + from tools.image_generation_tool import _build_dynamic_image_schema + from agent import image_gen_registry as reg + + _write_cfg(cfg_home, {"image_gen": {"provider": "both"}}) + reg.register_provider(_PluginBothProvider()) + self._no_discovery(monkeypatch) + + desc = _build_dynamic_image_schema()["description"] + assert "image-to-image / editing" in desc + assert "up to 5 reference image(s)" in desc + + def test_builder_wired_into_registry(self): + from tools.registry import discover_builtin_tools, registry + + discover_builtin_tools() + entry = registry._tools["image_generate"] + assert entry.dynamic_schema_overrides is not None + out = entry.dynamic_schema_overrides() + assert "description" in out diff --git a/tools/image_generation_tool.py b/tools/image_generation_tool.py index d7eeb30d175..3213068ddd9 100644 --- a/tools/image_generation_tool.py +++ b/tools/image_generation_tool.py @@ -116,6 +116,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "output_format", "enable_safety_checker", }, "upscale": False, + # Image-to-image / editing: FLUX.2 [klein] 9B edit endpoint takes + # `image_urls` (list). Natural-language edits, multi-ref. + "edit_endpoint": "fal-ai/flux-2/klein/9b/edit", + "edit_supports": { + "prompt", "image_urls", "num_inference_steps", "seed", + "output_format", "enable_safety_checker", + }, + "max_reference_images": 9, }, "fal-ai/flux-2-pro": { "display": "FLUX 2 Pro", @@ -143,6 +151,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "safety_tolerance", "sync_mode", "seed", }, "upscale": True, # Backward-compat: current default behavior. + # Edit endpoint accepts up to 9 reference images. 
+ "edit_endpoint": "fal-ai/flux-2-pro/edit", + "edit_supports": { + "prompt", "image_urls", "num_inference_steps", "guidance_scale", + "num_images", "output_format", "enable_safety_checker", + "safety_tolerance", "sync_mode", "seed", + }, + "max_reference_images": 9, }, "fal-ai/z-image/turbo": { "display": "Z-Image Turbo", @@ -194,6 +210,15 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "enable_web_search", "limit_generations", }, "upscale": False, + # Nano Banana Pro edit (Gemini 3 Pro Image): natural-language edits + # with up to 2 reference images via `image_urls`. + "edit_endpoint": "fal-ai/nano-banana-pro/edit", + "edit_supports": { + "prompt", "image_urls", "aspect_ratio", "num_images", + "output_format", "safety_tolerance", "seed", "sync_mode", + "resolution", "enable_web_search", "limit_generations", + }, + "max_reference_images": 2, }, "fal-ai/gpt-image-1.5": { "display": "GPT Image 1.5", @@ -218,6 +243,13 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "background", "sync_mode", }, "upscale": False, + # Edit endpoint: high-fidelity edits preserving composition/lighting. + "edit_endpoint": "fal-ai/gpt-image-1.5/edit", + "edit_supports": { + "prompt", "image_urls", "image_size", "quality", "num_images", + "output_format", "sync_mode", + }, + "max_reference_images": 16, }, "fal-ai/gpt-image-2": { "display": "GPT Image 2", @@ -250,6 +282,15 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { # through the shared FAL billing path. }, "upscale": False, + # GPT Image 2 edit endpoint lives under the OpenAI namespace on FAL + # (NOT fal-ai/). Takes `image_urls` (list) + optional mask. We don't + # send `image_size` on edit so the model auto-infers from input. 
+ "edit_endpoint": "openai/gpt-image-2/edit", + "edit_supports": { + "prompt", "image_urls", "quality", "num_images", "output_format", + "sync_mode", "mask_image_url", + }, + "max_reference_images": 16, }, "fal-ai/ideogram/v3": { "display": "Ideogram V3", @@ -272,6 +313,13 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "style", "seed", }, "upscale": False, + # Ideogram V3 edit endpoint takes `image_urls` (list). + "edit_endpoint": "fal-ai/ideogram/v3/edit", + "edit_supports": { + "prompt", "image_urls", "rendering_speed", "expand_prompt", + "style", "seed", + }, + "max_reference_images": 1, }, "fal-ai/recraft/v4/pro/text-to-image": { "display": "Recraft V4 Pro", @@ -317,6 +365,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "num_images", "output_format", "acceleration", "seed", "sync_mode", }, "upscale": False, + # Qwen edit uses the Qwen Image 2.0 Pro editing endpoint, which takes + # `image_urls` (list) + natural-language edit instructions. + "edit_endpoint": "fal-ai/qwen-image-2/pro/edit", + "edit_supports": { + "prompt", "image_urls", "num_inference_steps", "guidance_scale", + "num_images", "output_format", "acceleration", "seed", "sync_mode", + }, + "max_reference_images": 3, }, # Krea 2 — Krea's first foundation image model, day-0 partner launch on # fal (2026-05-27). Same model family as our direct ``plugins/image_gen/krea`` @@ -554,6 +610,55 @@ def _build_fal_payload( return {k: v for k, v in payload.items() if k in supports} +def _build_fal_edit_payload( + model_id: str, + prompt: str, + image_urls: list, + aspect_ratio: str = DEFAULT_ASPECT_RATIO, + seed: Optional[int] = None, + overrides: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """Build a FAL *edit* request payload (image-to-image) from unified inputs. + + Every FAL edit endpoint takes ``image_urls`` (a list of source/reference + image URLs) plus the prompt. 
Size handling differs from text-to-image: + most edit endpoints auto-infer output dimensions from the input image, so + we only send ``image_size`` / ``aspect_ratio`` when the edit endpoint's + ``edit_supports`` whitelist accepts it. Keys outside ``edit_supports`` are + stripped before submission. + """ + meta = FAL_MODELS[model_id] + edit_supports = meta.get("edit_supports") or set() + size_style = meta["size_style"] + sizes = meta["sizes"] + + aspect = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip() + if aspect not in sizes: + aspect = DEFAULT_ASPECT_RATIO + + payload: Dict[str, Any] = dict(meta.get("defaults", {})) + payload["prompt"] = (prompt or "").strip() + payload["image_urls"] = list(image_urls) + + # Only express output size when the edit endpoint advertises the key. + # gpt-image-2 edit auto-infers size from the input, so `image_size` is + # intentionally absent from its edit_supports whitelist. + if size_style in {"image_size_preset", "gpt_literal"} and "image_size" in edit_supports: + payload["image_size"] = sizes[aspect] + elif size_style == "aspect_ratio" and "aspect_ratio" in edit_supports: + payload["aspect_ratio"] = sizes[aspect] + + if seed is not None and isinstance(seed, int): + payload["seed"] = seed + + if overrides: + for k, v in overrides.items(): + if v is not None: + payload[k] = v + + return {k: v for k, v in payload.items() if k in edit_supports} + + # --------------------------------------------------------------------------- # Upscaler # --------------------------------------------------------------------------- @@ -729,19 +834,39 @@ def image_generate_tool( num_images: Optional[int] = None, output_format: Optional[str] = None, seed: Optional[int] = None, + image_url: Optional[str] = None, + reference_image_urls: Optional[list] = None, ) -> str: - """Generate an image from a text prompt using the configured FAL model. + """Generate an image from a text prompt, or edit a source image, via FAL. 
+ - The agent-facing schema exposes only ``prompt`` and ``aspect_ratio``; the - remaining kwargs are overrides for direct Python callers and are filtered - per-model via the ``supports`` whitelist (unsupported overrides are - silently dropped so legacy callers don't break when switching models). + Routing: when ``image_url`` (or ``reference_image_urls``) is provided, the + call routes to the configured model's ``edit_endpoint``; if the model has + none, an error is returned. With no source image it's text-to-image. + + The agent-facing schema exposes ``prompt``, ``aspect_ratio``, ``image_url`` + and ``reference_image_urls``; the remaining kwargs are overrides for direct + Python callers and are filtered per-model via the ``supports`` / + ``edit_supports`` whitelist (unsupported overrides are silently dropped so + legacy callers don't break when switching models). Returns a JSON string with ``{"success": bool, "image": url | None, - "error": str, "error_type": str}``. + "modality": "text" | "image", "error": str, "error_type": str}``. """ model_id, meta = _resolve_fal_model() + # Collect any source images (primary + references) into one ordered list. 
+ source_images: list = [] + if isinstance(image_url, str) and image_url.strip(): + source_images.append(image_url.strip()) + if isinstance(reference_image_urls, (list, tuple)): + for ref in reference_image_urls: + if isinstance(ref, str) and ref.strip(): + source_images.append(ref.strip()) + + edit_endpoint = meta.get("edit_endpoint") + use_edit = bool(source_images) and bool(edit_endpoint) + modality = "image" if use_edit else "text" + debug_call_data = { "model": model_id, "parameters": { @@ -752,6 +877,8 @@ def image_generate_tool( "num_images": num_images, "output_format": output_format, "seed": seed, + "modality": modality, + "source_images": len(source_images), }, "error": None, "success": False, @@ -768,6 +895,17 @@ def image_generate_tool( if not (fal_key_is_configured() or _resolve_managed_fal_gateway()): raise ValueError(_build_no_backend_setup_message()) + # If the caller supplied source images but the active model has no + # edit endpoint, fail with a clear, actionable message instead of + # silently dropping the images and producing an unrelated picture. + if source_images and not edit_endpoint: + raise ValueError( + f"Model '{meta.get('display', model_id)}' ({model_id}) is not " + f"capable of image-to-image / editing. Provide a text-only " + f"prompt (omit image_url), or switch to an edit-capable model " + f"via `hermes tools` → Image Generation." + ) + aspect_lc = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip() if aspect_lc not in VALID_ASPECT_RATIOS: logger.warning( @@ -786,16 +924,31 @@ def image_generate_tool( if output_format is not None: overrides["output_format"] = output_format - arguments = _build_fal_payload( - model_id, prompt, aspect_lc, seed=seed, overrides=overrides, - ) + if use_edit: + # Clamp total source images (primary + references) to the model's cap. 
+ max_refs = int(meta.get("max_reference_images") or 1) + clamped_sources = source_images[:max_refs] if max_refs > 0 else source_images + arguments = _build_fal_edit_payload( + model_id, prompt, clamped_sources, aspect_lc, + seed=seed, overrides=overrides, + ) + endpoint = edit_endpoint + logger.info( + "Editing image with %s (%s) — %d source image(s), prompt: %s", + meta.get("display", model_id), endpoint, len(clamped_sources), + prompt[:80], + ) + else: + arguments = _build_fal_payload( + model_id, prompt, aspect_lc, seed=seed, overrides=overrides, + ) + endpoint = model_id + logger.info( + "Generating image with %s (%s) — prompt: %s", + meta.get("display", model_id), model_id, prompt[:80], + ) - logger.info( - "Generating image with %s (%s) — prompt: %s", - meta.get("display", model_id), model_id, prompt[:80], - ) - - handler = _submit_fal_request(model_id, arguments=arguments) + handler = _submit_fal_request(endpoint, arguments=arguments) result = handler.get() generation_time = (datetime.datetime.now() - start_time).total_seconds() @@ -807,7 +960,9 @@ def image_generate_tool( if not images: raise ValueError("No images were generated") - should_upscale = bool(meta.get("upscale", False)) + # Edit endpoints already return the final composition; the Clarity + # upscaler is a text-to-image quality pass, so skip it for edits. 
+ should_upscale = bool(meta.get("upscale", False)) and not use_edit formatted_images = [] for img in images: @@ -834,13 +989,15 @@ def image_generate_tool( upscaled_count = sum(1 for img in formatted_images if img.get("upscaled")) logger.info( - "Generated %s image(s) in %.1fs (%s upscaled) via %s", - len(formatted_images), generation_time, upscaled_count, model_id, + "Generated %s image(s) in %.1fs (%s upscaled) via %s [%s]", + len(formatted_images), generation_time, upscaled_count, endpoint, + modality, ) response_data = { "success": True, "image": formatted_images[0]["url"] if formatted_images else None, + "modality": modality, } debug_call_data["success"] = True @@ -1001,22 +1158,34 @@ from tools.registry import registry, tool_error IMAGE_GENERATE_SCHEMA = { "name": "image_generate", + # Placeholder — the real description is rebuilt dynamically at + # get_tool_definitions() time so it reflects the active backend's actual + # capabilities (whether the selected model supports image-to-image / + # editing). See _build_dynamic_image_schema() below and the + # dynamic-tool-schemas skill. "description": ( - "Generate high-quality images from text prompts. The underlying " - "backend (FAL, OpenAI, etc.) and model are user-configured and not " - "selectable by the agent. Returns either a URL or an absolute file " - "path in the `image` field; display it with markdown " - "![description](url-or-path) and the gateway will deliver it. When " - "the active terminal backend has a different filesystem, successful " - "local-file results may also include `agent_visible_image` for " - "follow-up terminal/file operations." + "Generate high-quality images from text prompts (text-to-image), or " + "edit / transform an existing image (image-to-image) when the active " + "model supports it. Pass `image_url` to edit that image; add " + "`reference_image_urls` for style/composition references; omit both " + "for text-to-image. The underlying backend (FAL, OpenAI, xAI, etc.) 
" + "and model are user-configured and not selectable by the agent. " + "Returns either a URL or an absolute file path in the `image` field; " + "display it with markdown ![description](url-or-path) and the gateway " + "will deliver it. When the active terminal backend has a different " + "filesystem, successful local-file results may also include " + "`agent_visible_image` for follow-up terminal/file operations." ), "parameters": { "type": "object", "properties": { "prompt": { "type": "string", - "description": "The text prompt describing the desired image. Be detailed and descriptive.", + "description": ( + "The text prompt describing the desired image (text-to-" + "image) or the edit to apply (image-to-image). Be detailed " + "and descriptive." + ), }, "aspect_ratio": { "type": "string", @@ -1024,6 +1193,28 @@ IMAGE_GENERATE_SCHEMA = { "description": "The aspect ratio of the generated image. 'landscape' is 16:9 wide, 'portrait' is 16:9 tall, 'square' is 1:1.", "default": DEFAULT_ASPECT_RATIO, }, + "image_url": { + "type": "string", + "description": ( + "Optional source image to edit/transform (image-to-image). " + "When provided, the active backend routes to its image " + "editing endpoint; when omitted, it generates from text " + "alone. Pass a public URL or an absolute local file path " + "from the conversation. Only honored by models that " + "support editing — the description above indicates whether " + "the active model does." + ), + }, + "reference_image_urls": { + "type": "array", + "items": {"type": "string"}, + "description": ( + "Optional list of additional reference image URLs / paths " + "(style, character, or composition references) to guide an " + "image-to-image edit. Supported only by some models and " + "capped per-model; the description above indicates the max." 
+ ), + }, }, "required": ["prompt"], }, @@ -1069,7 +1260,12 @@ def _read_configured_image_provider(): return None -def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str): +def _dispatch_to_plugin_provider( + prompt: str, + aspect_ratio: str, + image_url: Optional[str] = None, + reference_image_urls: Optional[list] = None, +): """Route the call to a plugin-registered provider when one is selected. Returns a JSON string on dispatch, or ``None`` to fall through to the @@ -1080,6 +1276,10 @@ def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str): ``plugins/image_gen/fal/`` plugin (the plugin re-enters this module's pipeline via ``_it`` indirection so behavior is identical to the direct call, just routed through the registry). + + ``image_url`` / ``reference_image_urls`` enable image-to-image / editing: + they are forwarded to the provider's ``generate()`` so the backend can + route to its edit endpoint. """ configured = _read_configured_image_provider() if not configured: @@ -1122,11 +1322,53 @@ def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str): "error_type": "provider_not_registered", }) + kwargs: Dict[str, Any] = {"prompt": prompt, "aspect_ratio": aspect_ratio} try: - kwargs = {"prompt": prompt, "aspect_ratio": aspect_ratio} if configured_model: kwargs["model"] = configured_model + if isinstance(image_url, str) and image_url.strip(): + kwargs["image_url"] = image_url.strip() + norm_refs = None + if reference_image_urls is not None: + from agent.image_gen_provider import normalize_reference_images + + norm_refs = normalize_reference_images(reference_image_urls) + if norm_refs: + kwargs["reference_image_urls"] = norm_refs result = provider.generate(**kwargs) + except TypeError as exc: + # A provider whose generate() signature predates image_url support + # (third-party plugin not yet updated) raises TypeError here. 
There is + # no retry: an edit request gets a clear modality_unsupported error, and + # any other TypeError is reported as a provider_exception. 
+ if "image_url" in kwargs or "reference_image_urls" in kwargs: + logger.warning( + "image_gen provider '%s' rejected image-to-image kwargs " + "(signature too narrow): %s", + getattr(provider, "name", "?"), exc, + ) + return json.dumps({ + "success": False, + "image": None, + "error": ( + f"Provider '{getattr(provider, 'name', '?')}' does not " + f"support image-to-image / editing (its generate() " + f"signature is out of date with the image_generate schema). " + f"Omit image_url for text-to-image, or pick a backend that " + f"supports editing via `hermes tools` → Image Generation." + ), + "error_type": "modality_unsupported", + }) + logger.warning( + "Image gen provider '%s' raised TypeError: %s", + getattr(provider, "name", "?"), exc, + ) + return json.dumps({ + "success": False, + "image": None, + "error": f"Provider '{getattr(provider, 'name', '?')}' error: {exc}", + "error_type": "provider_exception", + }) except Exception as exc: logger.warning( "Image gen provider '%s' raised: %s", @@ -1153,21 +1395,144 @@ def _handle_image_generate(args, **kw): if not prompt: return tool_error("prompt is required for image generation") aspect_ratio = args.get("aspect_ratio", DEFAULT_ASPECT_RATIO) + image_url = args.get("image_url") + reference_image_urls = args.get("reference_image_urls") task_id = kw.get("task_id") # Route to a plugin-registered provider if one is active (and it's # not the in-tree FAL path). 
- dispatched = _dispatch_to_plugin_provider(prompt, aspect_ratio) + dispatched = _dispatch_to_plugin_provider( + prompt, aspect_ratio, + image_url=image_url, + reference_image_urls=reference_image_urls, + ) if dispatched is not None: return _postprocess_image_generate_result(dispatched, task_id=task_id) raw = image_generate_tool( prompt=prompt, aspect_ratio=aspect_ratio, + image_url=image_url, + reference_image_urls=reference_image_urls, ) return _postprocess_image_generate_result(raw, task_id=task_id) +# --------------------------------------------------------------------------- +# Dynamic schema — reflect the active backend's image-to-image capability +# --------------------------------------------------------------------------- +# +# Why dynamic: whether the active model supports image-to-image / editing +# depends entirely on the user's configured backend + model. Telling the +# model up front ("the active model is text-to-image only — image_url will be +# rejected") saves a wasted turn. Memoized by config.yaml mtime in +# model_tools.get_tool_definitions(), so it rebuilds when the user switches +# model/provider via `hermes tools` or `/skills`. + + +_GENERIC_IMAGE_DESCRIPTION = IMAGE_GENERATE_SCHEMA["description"] + + +def _active_image_capabilities() -> Dict[str, Any]: + """Best-effort: return the active backend/model's image capabilities. + + Resolution order mirrors the runtime dispatch: + 1. If ``image_gen.provider`` is set, ask that plugin provider. + 2. Otherwise inspect the in-tree FAL model catalog for the active model. + + Returns a dict like ``{"modalities": [...], "max_reference_images": N, + "model": "...", "provider": "..."}``. Never raises. 
+ """ + info: Dict[str, Any] = {"modalities": ["text"], "max_reference_images": 0} + + configured_provider = _read_configured_image_provider() + if configured_provider and configured_provider != "fal": + try: + from agent.image_gen_registry import get_provider + from hermes_cli.plugins import _ensure_plugins_discovered + + _ensure_plugins_discovered() + provider = get_provider(configured_provider) + if provider is not None: + caps = {} + try: + caps = provider.capabilities() or {} + except Exception: # noqa: BLE001 + caps = {} + info["provider"] = provider.display_name + info["model"] = _read_configured_image_model() or (provider.default_model() or "") + if caps.get("modalities"): + info["modalities"] = list(caps["modalities"]) + if caps.get("max_reference_images"): + info["max_reference_images"] = int(caps["max_reference_images"]) + return info + except Exception: # noqa: BLE001 + pass + + # In-tree FAL path (provider unset or == "fal"). + try: + model_id, meta = _resolve_fal_model() + info["provider"] = "FAL.ai" + info["model"] = meta.get("display", model_id) + if meta.get("edit_endpoint"): + info["modalities"] = ["text", "image"] + info["max_reference_images"] = int(meta.get("max_reference_images") or 1) + else: + info["modalities"] = ["text"] + info["max_reference_images"] = 0 + except Exception: # noqa: BLE001 + pass + + return info + + +def _build_dynamic_image_schema() -> Dict[str, Any]: + """Build a description reflecting whether the active model supports editing.""" + parts = [_GENERIC_IMAGE_DESCRIPTION] + + try: + info = _active_image_capabilities() + except Exception: # noqa: BLE001 + return {"description": _GENERIC_IMAGE_DESCRIPTION} + + provider = info.get("provider") + model = info.get("model") + modalities = set(info.get("modalities") or ["text"]) + + line = "\nActive backend" + if provider: + line += f": {provider}" + if model: + line += f" · model: {model}" + parts.append(line) + + if "image" in modalities and "text" in modalities: + max_refs = 
info.get("max_reference_images") or 0 + ref_note = ( + f"; up to {max_refs} reference image(s) via reference_image_urls" + if max_refs and max_refs > 1 + else "" + ) + parts.append( + "- supports both text-to-image (omit image_url) and " + f"image-to-image / editing (pass image_url){ref_note} — " + "routes automatically" + ) + elif "image" in modalities and "text" not in modalities: + parts.append( + "- this model is image-to-image / edit only — image_url is REQUIRED" + ) + else: + parts.append( + "- this model is text-to-image only — it is NOT capable of " + "image-to-image / editing; do not pass image_url or " + "reference_image_urls (they will be rejected). Provide a " + "text-only prompt." + ) + + return {"description": "\n".join(parts)} + + registry.register( name="image_generate", toolset="image_gen", @@ -1177,4 +1542,5 @@ registry.register( requires_env=[], is_async=False, # sync fal_client API to avoid "Event loop is closed" in gateway emoji="🎨", + dynamic_schema_overrides=_build_dynamic_image_schema, ) diff --git a/website/docs/developer-guide/image-gen-provider-plugin.md b/website/docs/developer-guide/image-gen-provider-plugin.md index c9823d1cedd..b746ce82229 100644 --- a/website/docs/developer-guide/image-gen-provider-plugin.md +++ b/website/docs/developer-guide/image-gen-provider-plugin.md @@ -47,6 +47,7 @@ from agent.image_gen_provider import ( DEFAULT_ASPECT_RATIO, ImageGenProvider, error_response, + normalize_reference_images, resolve_aspect_ratio, save_b64_image, success_response, @@ -112,10 +113,20 @@ class MyBackendImageGenProvider(ImageGenProvider): ], } + def capabilities(self) -> Dict[str, Any]: + # Declare whether this backend supports image-to-image / editing. + # The tool layer surfaces this in the dynamic schema so the model + # knows when `image_url` is honored. Default (if you omit this) is + # text-only: {"modalities": ["text"], "max_reference_images": 0}. 
+ return {"modalities": ["text", "image"], "max_reference_images": 4} + def generate( self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, + *, + image_url: Optional[str] = None, + reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: prompt = (prompt or "").strip() @@ -130,6 +141,15 @@ class MyBackendImageGenProvider(ImageGenProvider): aspect_ratio=aspect_ratio, ) + # Routing: if image_url (or reference_image_urls) is set, the call is + # an image-to-image / edit request; otherwise text-to-image. Report + # which path you took via the `modality` field of success_response. + sources = [] + if image_url: + sources.append(image_url) + sources.extend(normalize_reference_images(reference_image_urls) or []) + modality = "image" if sources else "text" + # Model selection precedence: env var → config → default. The helper # _resolve_model() in the built-in openai plugin is a good reference. model_id = kwargs.get("model") or self.default_model() or "my-model-fast" @@ -137,11 +157,18 @@ class MyBackendImageGenProvider(ImageGenProvider): try: import my_backend_sdk client = my_backend_sdk.Client(api_key=os.environ["MY_BACKEND_API_KEY"]) - result = client.generate( - prompt=prompt, - model=model_id, - aspect_ratio=aspect_ratio, - ) + if modality == "image": + result = client.edit( + prompt=prompt, + model=model_id, + image_urls=sources, + ) + else: + result = client.generate( + prompt=prompt, + model=model_id, + aspect_ratio=aspect_ratio, + ) # Two shapes supported: # - URL string: return it as `image` @@ -162,6 +189,7 @@ class MyBackendImageGenProvider(ImageGenProvider): prompt=prompt, aspect_ratio=aspect_ratio, provider=self.name, + modality=modality, ) except Exception as exc: return error_response( diff --git a/website/docs/reference/tools-reference.md b/website/docs/reference/tools-reference.md index 2393a9db7d1..1f6b86c0063 100644 --- a/website/docs/reference/tools-reference.md +++ b/website/docs/reference/tools-reference.md @@ 
-114,7 +114,7 @@ Scoped to the Feishu document-comment handler. Drives comment read/write operati | Tool | Description | Requires environment | |------|-------------|----------------------| -| `image_generate` | Generate high-quality images from text prompts using FAL.ai. The underlying model is user-configured (default: FLUX 2 Klein 9B, sub-1s generation) and is not selectable by the agent. Returns a single image URL. Display it using… | FAL_KEY | +| `image_generate` | Generate images from text prompts (text-to-image) or edit/transform an existing image (image-to-image) via the user-configured backend (FAL.ai, OpenAI, xAI, Krea). Pass `image_url` to edit an image and `reference_image_urls` for style references; omit both for text-to-image. The model is user-configured and not selectable by the agent. Returns a single image URL or local path. | FAL_KEY / OPENAI_API_KEY / xAI OAuth / KREA_API_KEY | ## `kanban` toolset diff --git a/website/docs/user-guide/features/image-generation.md b/website/docs/user-guide/features/image-generation.md index 4f225ee00b1..62dfe7bd127 100644 --- a/website/docs/user-guide/features/image-generation.md +++ b/website/docs/user-guide/features/image-generation.md @@ -86,6 +86,46 @@ Create a square portrait of a wise old owl — use the typography model Make me a futuristic cityscape, landscape orientation ``` +## Image-to-Image / Editing + +The same `image_generate` tool also **edits existing images** when the active +model supports it — pass a source image and the backend routes to its editing +endpoint automatically (mirrors how `video_generate` handles image-to-video). +Omit the source image and it's plain text-to-image. + +``` +Take this photo and make it a rainy Tokyo street at night → +``` + +``` +Blend these two product shots into one hero image → +``` + +Two inputs drive the edit: + +- **`image_url`** — the primary source image to edit/transform (public URL or local path). 
+- **`reference_image_urls`** — additional style/composition references (capped per-model). + +### Which backends support editing + +| Backend | Image-to-image | Reference cap | How | +|---|---|---|---| +| **FAL.ai** (edit-capable models below) | ✓ | up to 9 | routes to the model's `/edit` endpoint | +| **OpenAI** (`gpt-image-2`) | ✓ | up to 16 | `images.edit()` | +| **xAI** (Grok Imagine) | ✓ | 1 | `/v1/images/edits` (`grok-imagine-image-quality`) | +| **Krea** (`Krea 2`) | ✓ | up to 10 | reference-guided generation (`image_style_references`) | +| **OpenAI (Codex auth)** | ✗ | — | text-to-image only | + +FAL models with an editing endpoint: `flux-2/klein/9b`, `flux-2-pro`, +`nano-banana-pro`, `gpt-image-1.5`, `gpt-image-2`, `ideogram/v3`, and +`qwen-image`. Pure text-to-image FAL models (`z-image/turbo`, `recraft`, +`krea/*`) reject image inputs with a clear error pointing you at an +edit-capable model. + +The active model's editing capability is surfaced in the tool description at +runtime, so the agent knows whether `image_url` will be honored before it +calls the tool. + ## Aspect Ratios Every model accepts the same three aspect ratios from the agent's perspective. Internally, each model's native size spec is filled in automatically: @@ -152,7 +192,7 @@ Debug logs go to `./logs/image_tools_debug_.json` with per-call deta ## Limitations -- **Requires FAL credentials** (direct `FAL_KEY` or Nous Subscription) -- **Text-to-image only** — no inpainting, img2img, or editing via this tool -- **Temporary URLs** — FAL returns hosted URLs that expire after hours/days; save locally if needed -- **Per-model constraints** — some models don't support `seed`, `num_inference_steps`, etc. 
The `supports` filter silently drops unsupported params; this is expected behavior +- **Requires credentials** for the active backend (FAL `FAL_KEY` / Nous Subscription, `OPENAI_API_KEY`, xAI OAuth, `KREA_API_KEY`) +- **Editing is model-dependent** — image-to-image works only on edit-capable models (see the table above); text-to-image-only models reject image inputs with a clear error +- **Temporary URLs** — backends return hosted URLs that expire after hours/days; Hermes materializes them to the local cache so delivery still works after expiry +- **Per-model constraints** — some models don't support `seed`, `num_inference_steps`, etc. The `supports` / `edit_supports` filter silently drops unsupported params; this is expected behavior