feat(image-gen): add image-to-image / editing to image_generate (#48705)

* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). - krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
2026-06-21 10:22:18 +00:00 · 2026-06-18 22:13:07 -07:00 · 2026-06-18 22:13:07 -07:00 · c02192ff6a
commit c02192ff6a
parent cfb55de5ea
13 changed files with 1239 additions and 106 deletions
--- a/tools/image_generation_tool.py
+++ b/tools/image_generation_tool.py
@ -116,6 +116,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "output_format", "enable_safety_checker",
        },
        "upscale": False,
+        # Image-to-image / editing: FLUX.2 [klein] 9B edit endpoint takes
+        # `image_urls` (list). Natural-language edits, multi-ref.
+        "edit_endpoint": "fal-ai/flux-2/klein/9b/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "num_inference_steps", "seed",
+            "output_format", "enable_safety_checker",
+        },
+        "max_reference_images": 9,
    },
    "fal-ai/flux-2-pro": {
        "display": "FLUX 2 Pro",
@ -143,6 +151,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "safety_tolerance", "sync_mode", "seed",
        },
        "upscale": True,   # Backward-compat: current default behavior.
+        # Edit endpoint accepts up to 9 reference images.
+        "edit_endpoint": "fal-ai/flux-2-pro/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "num_inference_steps", "guidance_scale",
+            "num_images", "output_format", "enable_safety_checker",
+            "safety_tolerance", "sync_mode", "seed",
+        },
+        "max_reference_images": 9,
    },
    "fal-ai/z-image/turbo": {
        "display": "Z-Image Turbo",
@ -194,6 +210,15 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "enable_web_search", "limit_generations",
        },
        "upscale": False,
+        # Nano Banana Pro edit (Gemini 3 Pro Image): natural-language edits
+        # with up to 2 reference images via `image_urls`.
+        "edit_endpoint": "fal-ai/nano-banana-pro/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "aspect_ratio", "num_images",
+            "output_format", "safety_tolerance", "seed", "sync_mode",
+            "resolution", "enable_web_search", "limit_generations",
+        },
+        "max_reference_images": 2,
    },
    "fal-ai/gpt-image-1.5": {
        "display": "GPT Image 1.5",
@ -218,6 +243,13 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "background", "sync_mode",
        },
        "upscale": False,
+        # Edit endpoint: high-fidelity edits preserving composition/lighting.
+        "edit_endpoint": "fal-ai/gpt-image-1.5/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "image_size", "quality", "num_images",
+            "output_format", "sync_mode",
+        },
+        "max_reference_images": 16,
    },
    "fal-ai/gpt-image-2": {
        "display": "GPT Image 2",
@ -250,6 +282,15 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            # through the shared FAL billing path.
        },
        "upscale": False,
+        # GPT Image 2 edit endpoint lives under the OpenAI namespace on FAL
+        # (NOT fal-ai/). Takes `image_urls` (list) + optional mask. We don't
+        # send `image_size` on edit so the model auto-infers from input.
+        "edit_endpoint": "openai/gpt-image-2/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "quality", "num_images", "output_format",
+            "sync_mode", "mask_image_url",
+        },
+        "max_reference_images": 16,
    },
    "fal-ai/ideogram/v3": {
        "display": "Ideogram V3",
@ -272,6 +313,13 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "style", "seed",
        },
        "upscale": False,
+        # Ideogram V3 edit endpoint takes `image_urls` (list).
+        "edit_endpoint": "fal-ai/ideogram/v3/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "rendering_speed", "expand_prompt",
+            "style", "seed",
+        },
+        "max_reference_images": 1,
    },
    "fal-ai/recraft/v4/pro/text-to-image": {
        "display": "Recraft V4 Pro",
@ -317,6 +365,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "num_images", "output_format", "acceleration", "seed", "sync_mode",
        },
        "upscale": False,
+        # Qwen edit uses the Qwen Image 2.0 Pro editing endpoint, which takes
+        # `image_urls` (list) + natural-language edit instructions.
+        "edit_endpoint": "fal-ai/qwen-image-2/pro/edit",
+        "edit_supports": {
+            "prompt", "image_urls", "num_inference_steps", "guidance_scale",
+            "num_images", "output_format", "acceleration", "seed", "sync_mode",
+        },
+        "max_reference_images": 3,
    },
    # Krea 2 — Krea's first foundation image model, day-0 partner launch on
    # fal (2026-05-27). Same model family as our direct ``plugins/image_gen/krea``
@ -554,6 +610,55 @@ def _build_fal_payload(
    return {k: v for k, v in payload.items() if k in supports}


+def _build_fal_edit_payload(
+    model_id: str,
+    prompt: str,
+    image_urls: list,
+    aspect_ratio: str = DEFAULT_ASPECT_RATIO,
+    seed: Optional[int] = None,
+    overrides: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """Build a FAL *edit* request payload (image-to-image) from unified inputs.
+
+    Every FAL edit endpoint takes ``image_urls`` (a list of source/reference
+    image URLs) plus the prompt. Size handling differs from text-to-image:
+    most edit endpoints auto-infer output dimensions from the input image, so
+    we only send ``image_size`` / ``aspect_ratio`` when the edit endpoint's
+    ``edit_supports`` whitelist accepts it. Keys outside ``edit_supports`` are
+    stripped before submission.
+    """
+    meta = FAL_MODELS[model_id]
+    edit_supports = meta.get("edit_supports") or set()
+    size_style = meta["size_style"]
+    sizes = meta["sizes"]
+
+    aspect = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip()
+    if aspect not in sizes:
+        aspect = DEFAULT_ASPECT_RATIO
+
+    payload: Dict[str, Any] = dict(meta.get("defaults", {}))
+    payload["prompt"] = (prompt or "").strip()
+    payload["image_urls"] = list(image_urls)
+
+    # Only express output size when the edit endpoint advertises the key.
+    # gpt-image-2 edit auto-infers size from the input, so `image_size` is
+    # intentionally absent from its edit_supports whitelist.
+    if size_style in {"image_size_preset", "gpt_literal"} and "image_size" in edit_supports:
+        payload["image_size"] = sizes[aspect]
+    elif size_style == "aspect_ratio" and "aspect_ratio" in edit_supports:
+        payload["aspect_ratio"] = sizes[aspect]
+
+    if seed is not None and isinstance(seed, int):
+        payload["seed"] = seed
+
+    if overrides:
+        for k, v in overrides.items():
+            if v is not None:
+                payload[k] = v
+
+    return {k: v for k, v in payload.items() if k in edit_supports}
+
+
 # ---------------------------------------------------------------------------
 # Upscaler
 # ---------------------------------------------------------------------------
@ -729,19 +834,39 @@ def image_generate_tool(
    num_images: Optional[int] = None,
    output_format: Optional[str] = None,
    seed: Optional[int] = None,
+    image_url: Optional[str] = None,
+    reference_image_urls: Optional[list] = None,
 ) -> str:
-    """Generate an image from a text prompt using the configured FAL model.
+    """Generate an image from a text prompt, or edit a source image, via FAL.

-    The agent-facing schema exposes only ``prompt`` and ``aspect_ratio``; the
-    remaining kwargs are overrides for direct Python callers and are filtered
-    per-model via the ``supports`` whitelist (unsupported overrides are
-    silently dropped so legacy callers don't break when switching models).
+    Routing: when ``image_url`` (or ``reference_image_urls``) is provided AND
+    the configured model declares an ``edit_endpoint``, the call routes to that
+    image-to-image / edit endpoint; otherwise it's plain text-to-image.
+
+    The agent-facing schema exposes ``prompt``, ``aspect_ratio``, ``image_url``
+    and ``reference_image_urls``; the remaining kwargs are overrides for direct
+    Python callers and are filtered per-model via the ``supports`` /
+    ``edit_supports`` whitelist (unsupported overrides are silently dropped so
+    legacy callers don't break when switching models).

    Returns a JSON string with ``{"success": bool, "image": url | None,
-    "error": str, "error_type": str}``.
+    "modality": "text" | "image", "error": str, "error_type": str}``.
    """
    model_id, meta = _resolve_fal_model()

+    # Collect any source images (primary + references) into one ordered list.
+    source_images: list = []
+    if isinstance(image_url, str) and image_url.strip():
+        source_images.append(image_url.strip())
+    if isinstance(reference_image_urls, (list, tuple)):
+        for ref in reference_image_urls:
+            if isinstance(ref, str) and ref.strip():
+                source_images.append(ref.strip())
+
+    edit_endpoint = meta.get("edit_endpoint")
+    use_edit = bool(source_images) and bool(edit_endpoint)
+    modality = "image" if use_edit else "text"
+
    debug_call_data = {
        "model": model_id,
        "parameters": {
@ -752,6 +877,8 @@ def image_generate_tool(
            "num_images": num_images,
            "output_format": output_format,
            "seed": seed,
+            "modality": modality,
+            "source_images": len(source_images),
        },
        "error": None,
        "success": False,
@ -768,6 +895,17 @@ def image_generate_tool(
        if not (fal_key_is_configured() or _resolve_managed_fal_gateway()):
            raise ValueError(_build_no_backend_setup_message())

+        # If the caller supplied source images but the active model has no
+        # edit endpoint, fail with a clear, actionable message instead of
+        # silently dropping the images and producing an unrelated picture.
+        if source_images and not edit_endpoint:
+            raise ValueError(
+                f"Model '{meta.get('display', model_id)}' ({model_id}) is not "
+                f"capable of image-to-image / editing. Provide a text-only "
+                f"prompt (omit image_url), or switch to an edit-capable model "
+                f"via `hermes tools` → Image Generation."
+            )
+
        aspect_lc = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip()
        if aspect_lc not in VALID_ASPECT_RATIOS:
            logger.warning(
@ -786,16 +924,31 @@ def image_generate_tool(
        if output_format is not None:
            overrides["output_format"] = output_format

-        arguments = _build_fal_payload(
-            model_id, prompt, aspect_lc, seed=seed, overrides=overrides,
-        )
+        if use_edit:
+            # Clamp reference count to the model's declared cap.
+            max_refs = int(meta.get("max_reference_images") or 1)
+            clamped_sources = source_images[:max_refs] if max_refs > 0 else source_images
+            arguments = _build_fal_edit_payload(
+                model_id, prompt, clamped_sources, aspect_lc,
+                seed=seed, overrides=overrides,
+            )
+            endpoint = edit_endpoint
+            logger.info(
+                "Editing image with %s (%s) — %d source image(s), prompt: %s",
+                meta.get("display", model_id), endpoint, len(clamped_sources),
+                prompt[:80],
+            )
+        else:
+            arguments = _build_fal_payload(
+                model_id, prompt, aspect_lc, seed=seed, overrides=overrides,
+            )
+            endpoint = model_id
+            logger.info(
+                "Generating image with %s (%s) — prompt: %s",
+                meta.get("display", model_id), model_id, prompt[:80],
+            )

-        logger.info(
-            "Generating image with %s (%s) — prompt: %s",
-            meta.get("display", model_id), model_id, prompt[:80],
-        )
-
-        handler = _submit_fal_request(model_id, arguments=arguments)
+        handler = _submit_fal_request(endpoint, arguments=arguments)
        result = handler.get()

        generation_time = (datetime.datetime.now() - start_time).total_seconds()
@ -807,7 +960,9 @@ def image_generate_tool(
        if not images:
            raise ValueError("No images were generated")

-        should_upscale = bool(meta.get("upscale", False))
+        # Edit endpoints already return the final composition; the Clarity
+        # upscaler is a text-to-image quality pass, so skip it for edits.
+        should_upscale = bool(meta.get("upscale", False)) and not use_edit

        formatted_images = []
        for img in images:
@ -834,13 +989,15 @@ def image_generate_tool(

        upscaled_count = sum(1 for img in formatted_images if img.get("upscaled"))
        logger.info(
-            "Generated %s image(s) in %.1fs (%s upscaled) via %s",
-            len(formatted_images), generation_time, upscaled_count, model_id,
+            "Generated %s image(s) in %.1fs (%s upscaled) via %s [%s]",
+            len(formatted_images), generation_time, upscaled_count, endpoint,
+            modality,
        )

        response_data = {
            "success": True,
            "image": formatted_images[0]["url"] if formatted_images else None,
+            "modality": modality,
        }

        debug_call_data["success"] = True
@ -1001,22 +1158,34 @@ from tools.registry import registry, tool_error

 IMAGE_GENERATE_SCHEMA = {
    "name": "image_generate",
+    # Placeholder — the real description is rebuilt dynamically at
+    # get_tool_definitions() time so it reflects the active backend's actual
+    # capabilities (whether the selected model supports image-to-image /
+    # editing). See _build_dynamic_image_schema() below and the
+    # dynamic-tool-schemas skill.
    "description": (
-        "Generate high-quality images from text prompts. The underlying "
-        "backend (FAL, OpenAI, etc.) and model are user-configured and not "
-        "selectable by the agent. Returns either a URL or an absolute file "
-        "path in the `image` field; display it with markdown "
-        "![description](url-or-path) and the gateway will deliver it. When "
-        "the active terminal backend has a different filesystem, successful "
-        "local-file results may also include `agent_visible_image` for "
-        "follow-up terminal/file operations."
+        "Generate high-quality images from text prompts (text-to-image), or "
+        "edit / transform an existing image (image-to-image) when the active "
+        "model supports it. Pass `image_url` to edit that image; add "
+        "`reference_image_urls` for style/composition references; omit both "
+        "for text-to-image. The underlying backend (FAL, OpenAI, xAI, etc.) "
+        "and model are user-configured and not selectable by the agent. "
+        "Returns either a URL or an absolute file path in the `image` field; "
+        "display it with markdown ![description](url-or-path) and the gateway "
+        "will deliver it. When the active terminal backend has a different "
+        "filesystem, successful local-file results may also include "
+        "`agent_visible_image` for follow-up terminal/file operations."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {
                "type": "string",
-                "description": "The text prompt describing the desired image. Be detailed and descriptive.",
+                "description": (
+                    "The text prompt describing the desired image (text-to-"
+                    "image) or the edit to apply (image-to-image). Be detailed "
+                    "and descriptive."
+                ),
            },
            "aspect_ratio": {
                "type": "string",
@ -1024,6 +1193,28 @@ IMAGE_GENERATE_SCHEMA = {
                "description": "The aspect ratio of the generated image. 'landscape' is 16:9 wide, 'portrait' is 16:9 tall, 'square' is 1:1.",
                "default": DEFAULT_ASPECT_RATIO,
            },
+            "image_url": {
+                "type": "string",
+                "description": (
+                    "Optional source image to edit/transform (image-to-image). "
+                    "When provided, the active backend routes to its image "
+                    "editing endpoint; when omitted, it generates from text "
+                    "alone. Pass a public URL or an absolute local file path "
+                    "from the conversation. Only honored by models that "
+                    "support editing — the description above indicates whether "
+                    "the active model does."
+                ),
+            },
+            "reference_image_urls": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": (
+                    "Optional list of additional reference image URLs / paths "
+                    "(style, character, or composition references) to guide an "
+                    "image-to-image edit. Supported only by some models and "
+                    "capped per-model; the description above indicates the max."
+                ),
+            },
        },
        "required": ["prompt"],
    },
@ -1069,7 +1260,12 @@ def _read_configured_image_provider():
    return None


-def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str):
+def _dispatch_to_plugin_provider(
+    prompt: str,
+    aspect_ratio: str,
+    image_url: Optional[str] = None,
+    reference_image_urls: Optional[list] = None,
+):
    """Route the call to a plugin-registered provider when one is selected.

    Returns a JSON string on dispatch, or ``None`` to fall through to the
@ -1080,6 +1276,10 @@ def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str):
    ``plugins/image_gen/fal/`` plugin (the plugin re-enters this module's
    pipeline via ``_it`` indirection so behavior is identical to the
    direct call, just routed through the registry).
+
+    ``image_url`` / ``reference_image_urls`` enable image-to-image / editing:
+    they are forwarded to the provider's ``generate()`` so the backend can
+    route to its edit endpoint.
    """
    configured = _read_configured_image_provider()
    if not configured:
@ -1122,11 +1322,53 @@ def _dispatch_to_plugin_provider(prompt: str, aspect_ratio: str):
            "error_type": "provider_not_registered",
        })

+    kwargs: Dict[str, Any] = {"prompt": prompt, "aspect_ratio": aspect_ratio}
    try:
-        kwargs = {"prompt": prompt, "aspect_ratio": aspect_ratio}
        if configured_model:
            kwargs["model"] = configured_model
+        if isinstance(image_url, str) and image_url.strip():
+            kwargs["image_url"] = image_url.strip()
+        norm_refs = None
+        if reference_image_urls is not None:
+            from agent.image_gen_provider import normalize_reference_images
+
+            norm_refs = normalize_reference_images(reference_image_urls)
+        if norm_refs:
+            kwargs["reference_image_urls"] = norm_refs
        result = provider.generate(**kwargs)
+    except TypeError as exc:
+        # A provider whose generate() signature predates image_url support
+        # (third-party plugin not yet updated) — retry without the new kwargs
+        # so text-to-image keeps working, but surface a clear note when the
+        # user actually asked for an edit.
+        if "image_url" in kwargs or "reference_image_urls" in kwargs:
+            logger.warning(
+                "image_gen provider '%s' rejected image-to-image kwargs "
+                "(signature too narrow): %s",
+                getattr(provider, "name", "?"), exc,
+            )
+            return json.dumps({
+                "success": False,
+                "image": None,
+                "error": (
+                    f"Provider '{getattr(provider, 'name', '?')}' does not "
+                    f"support image-to-image / editing (its generate() "
+                    f"signature is out of date with the image_generate schema). "
+                    f"Omit image_url for text-to-image, or pick a backend that "
+                    f"supports editing via `hermes tools` → Image Generation."
+                ),
+                "error_type": "modality_unsupported",
+            })
+        logger.warning(
+            "Image gen provider '%s' raised TypeError: %s",
+            getattr(provider, "name", "?"), exc,
+        )
+        return json.dumps({
+            "success": False,
+            "image": None,
+            "error": f"Provider '{getattr(provider, 'name', '?')}' error: {exc}",
+            "error_type": "provider_exception",
+        })
    except Exception as exc:
        logger.warning(
            "Image gen provider '%s' raised: %s",
@ -1153,21 +1395,144 @@ def _handle_image_generate(args, **kw):
    if not prompt:
        return tool_error("prompt is required for image generation")
    aspect_ratio = args.get("aspect_ratio", DEFAULT_ASPECT_RATIO)
+    image_url = args.get("image_url")
+    reference_image_urls = args.get("reference_image_urls")
    task_id = kw.get("task_id")

    # Route to a plugin-registered provider if one is active (and it's
    # not the in-tree FAL path).
-    dispatched = _dispatch_to_plugin_provider(prompt, aspect_ratio)
+    dispatched = _dispatch_to_plugin_provider(
+        prompt, aspect_ratio,
+        image_url=image_url,
+        reference_image_urls=reference_image_urls,
+    )
    if dispatched is not None:
        return _postprocess_image_generate_result(dispatched, task_id=task_id)

    raw = image_generate_tool(
        prompt=prompt,
        aspect_ratio=aspect_ratio,
+        image_url=image_url,
+        reference_image_urls=reference_image_urls,
    )
    return _postprocess_image_generate_result(raw, task_id=task_id)


+# ---------------------------------------------------------------------------
+# Dynamic schema — reflect the active backend's image-to-image capability
+# ---------------------------------------------------------------------------
+#
+# Why dynamic: whether the active model supports image-to-image / editing
+# depends entirely on the user's configured backend + model. Telling the
+# model up front ("the active model is text-to-image only — image_url will be
+# rejected") saves a wasted turn. Memoized by config.yaml mtime in
+# model_tools.get_tool_definitions(), so it rebuilds when the user switches
+# model/provider via `hermes tools` or `/skills`.
+
+
+_GENERIC_IMAGE_DESCRIPTION = IMAGE_GENERATE_SCHEMA["description"]
+
+
+def _active_image_capabilities() -> Dict[str, Any]:
+    """Best-effort: return the active backend/model's image capabilities.
+
+    Resolution order mirrors the runtime dispatch:
+    1. If ``image_gen.provider`` is set, ask that plugin provider.
+    2. Otherwise inspect the in-tree FAL model catalog for the active model.
+
+    Returns a dict like ``{"modalities": [...], "max_reference_images": N,
+    "model": "...", "provider": "..."}``. Never raises.
+    """
+    info: Dict[str, Any] = {"modalities": ["text"], "max_reference_images": 0}
+
+    configured_provider = _read_configured_image_provider()
+    if configured_provider and configured_provider != "fal":
+        try:
+            from agent.image_gen_registry import get_provider
+            from hermes_cli.plugins import _ensure_plugins_discovered
+
+            _ensure_plugins_discovered()
+            provider = get_provider(configured_provider)
+            if provider is not None:
+                caps = {}
+                try:
+                    caps = provider.capabilities() or {}
+                except Exception:  # noqa: BLE001
+                    caps = {}
+                info["provider"] = provider.display_name
+                info["model"] = _read_configured_image_model() or (provider.default_model() or "")
+                if caps.get("modalities"):
+                    info["modalities"] = list(caps["modalities"])
+                if caps.get("max_reference_images"):
+                    info["max_reference_images"] = int(caps["max_reference_images"])
+                return info
+        except Exception:  # noqa: BLE001
+            pass
+
+    # In-tree FAL path (provider unset or == "fal").
+    try:
+        model_id, meta = _resolve_fal_model()
+        info["provider"] = "FAL.ai"
+        info["model"] = meta.get("display", model_id)
+        if meta.get("edit_endpoint"):
+            info["modalities"] = ["text", "image"]
+            info["max_reference_images"] = int(meta.get("max_reference_images") or 1)
+        else:
+            info["modalities"] = ["text"]
+            info["max_reference_images"] = 0
+    except Exception:  # noqa: BLE001
+        pass
+
+    return info
+
+
+def _build_dynamic_image_schema() -> Dict[str, Any]:
+    """Build a description reflecting whether the active model supports editing."""
+    parts = [_GENERIC_IMAGE_DESCRIPTION]
+
+    try:
+        info = _active_image_capabilities()
+    except Exception:  # noqa: BLE001
+        return {"description": _GENERIC_IMAGE_DESCRIPTION}
+
+    provider = info.get("provider")
+    model = info.get("model")
+    modalities = set(info.get("modalities") or ["text"])
+
+    line = "\nActive backend"
+    if provider:
+        line += f": {provider}"
+    if model:
+        line += f" · model: {model}"
+    parts.append(line)
+
+    if "image" in modalities and "text" in modalities:
+        max_refs = info.get("max_reference_images") or 0
+        ref_note = (
+            f"; up to {max_refs} reference image(s) via reference_image_urls"
+            if max_refs and max_refs > 1
+            else ""
+        )
+        parts.append(
+            "- supports both text-to-image (omit image_url) and "
+            f"image-to-image / editing (pass image_url){ref_note} — "
+            "routes automatically"
+        )
+    elif "image" in modalities and "text" not in modalities:
+        parts.append(
+            "- this model is image-to-image / edit only — image_url is REQUIRED"
+        )
+    else:
+        parts.append(
+            "- this model is text-to-image only — it is NOT capable of "
+            "image-to-image / editing; do not pass image_url or "
+            "reference_image_urls (they will be rejected). Provide a "
+            "text-only prompt."
+        )
+
+    return {"description": "\n".join(parts)}
+
+
 registry.register(
    name="image_generate",
    toolset="image_gen",
@ -1177,4 +1542,5 @@ registry.register(
    requires_env=[],
    is_async=False,   # sync fal_client API to avoid "Event loop is closed" in gateway
    emoji="🎨",
+    dynamic_schema_overrides=_build_dynamic_image_schema,
 )