hermes-agent/tests/tools/test_image_generation_image_to_image.py
Teknium c02192ff6a
feat(image-gen): add image-to-image / editing to image_generate (#48705)
* feat(image-gen): add image-to-image / editing to image_generate

Brings image generation to parity with video generation: the unified
image_generate tool now edits/transforms a source image (image-to-image)
when given image_url / reference_image_urls, routing to each backend's
edit endpoint, exactly as video_generate routes to image-to-video.

- ImageGenProvider ABC: generate() gains keyword-only image_url +
  reference_image_urls; new capabilities() declares modalities +
  max_reference_images (defaults to text-only, backward compatible).
  success_response gains a modality field; adds normalize_reference_images.
- image_generate tool: schema exposes image_url + reference_image_urls;
  dynamic schema reflects the active model's actual edit capability so the
  agent knows when image_url is honored. Handler + plugin dispatch forward
  the new inputs; legacy/text-only providers get a clear modality_unsupported
  error instead of silently dropping the source image.
- In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro,
  nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image)
  with per-model edit_supports whitelists + reference caps; routes to the
  /edit endpoint and skips the upscaler for edits.
- Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via
  grok-imagine-image-quality, JSON body per xAI docs), krea
  (image_style_references, 10 refs). openai-codex stays text-only and
  rejects edits with an actionable error.
- Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema,
  capabilities); updated 2 change-detector/lambda tests for the new schema.
- Docs: image-generation feature page, image-gen provider plugin guide,
  tools reference.

* fix(image-gen): preserve legacy passthrough in fal/krea plugin tests

Two existing plugin tests asserted pre-image-to-image behavior:
- fal: forward image_url/reference_image_urls only when supplied, so a
  text-to-image delegation stays byte-identical (no None kwargs).
- krea: keep dict-shaped image_style_references refs verbatim (the unified
  string refs go through normalize_reference_images; legacy non-string ref
  objects pass through unchanged) — fixes KeyError when callers pass the
  richer Krea ref-object shape.

* fix(image-gen): clearer not-capable message for text-to-image-only models

When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path,
which can't do editing through the Responses image_generation tool) gets a
source image, say 'this model is not capable of image-to-image / editing —
provide a text-only prompt' rather than sending the user shopping for other
backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint
error, and the dynamic tool-schema text-only line.
2026-06-18 22:13:07 -07:00

349 lines
14 KiB
Python

"""Tests for the image-to-image / editing surface of ``image_generate``.
Mirrors the video-gen image-to-video tests: the unified ``image_generate``
tool routes to a provider's edit endpoint when ``image_url`` /
``reference_image_urls`` is supplied, otherwise to text-to-image. Coverage:
- In-tree FAL edit payload construction (``_build_fal_edit_payload``)
- In-tree FAL routing (text vs edit endpoint) via ``image_generate_tool``
- Plugin dispatch forwards image_url / reference_image_urls to ``generate()``
- ``capabilities()`` honesty drives the dynamic tool-schema description
- Models without an edit endpoint reject image inputs with a clear error
"""
from __future__ import annotations
import json
from typing import Any, Dict, List, Optional
import pytest
import yaml
from agent import image_gen_registry
from agent.image_gen_provider import ImageGenProvider
@pytest.fixture(autouse=True)
def _reset_registry():
    """Reset the image-gen provider registry around every test.

    The registry's test-reset hook runs once before the test body and once
    after it, so providers registered by one test are not visible to the next.
    """
    reset = image_gen_registry._reset_for_tests
    reset()  # start from a clean registry
    yield
    reset()  # drop whatever the test registered
@pytest.fixture
def cfg_home(tmp_path, monkeypatch):
    """Point ``HERMES_HOME`` at the per-test temp directory and return it."""
    home_dir = tmp_path
    monkeypatch.setenv("HERMES_HOME", str(home_dir))
    return home_dir
def _write_cfg(home, cfg: dict):
    """Dump *cfg* as YAML into ``config.yaml`` directly under *home*."""
    config_path = home / "config.yaml"
    serialized = yaml.safe_dump(cfg)
    config_path.write_text(serialized)
# ---------------------------------------------------------------------------
# In-tree FAL edit payload + routing
# ---------------------------------------------------------------------------
class TestFalEditPayload:
    """Checks on ``_build_fal_edit_payload`` and the ``FAL_MODELS`` table."""

    def test_edit_payload_includes_image_urls(self):
        """The edit payload carries the prompt, the source URLs and the ratio."""
        from tools.image_generation_tool import _build_fal_edit_payload

        built = _build_fal_edit_payload(
            "fal-ai/nano-banana-pro",
            "make it night",
            ["https://x/y.png"],
            "landscape",
        )

        assert built["prompt"] == "make it night"
        assert built["image_urls"] == ["https://x/y.png"]
        # nano-banana edit advertises aspect_ratio in edit_supports
        assert built.get("aspect_ratio") == "16:9"

    def test_edit_payload_strips_keys_outside_edit_supports(self):
        """Keys the edit endpoint does not advertise are absent from the payload."""
        from tools.image_generation_tool import _build_fal_edit_payload

        # gpt-image-2 edit does NOT advertise image_size (auto-inferred), so
        # it must be stripped even though the text-to-image path sets it.
        built = _build_fal_edit_payload(
            "fal-ai/gpt-image-2",
            "swap bg",
            ["https://x/y.png"],
            "square",
        )

        assert "image_size" not in built
        assert built["image_urls"] == ["https://x/y.png"]
        assert built["quality"] == "medium"

    def test_text_only_model_has_no_edit_endpoint(self):
        """Only edit-capable models carry an ``edit_endpoint`` entry."""
        from tools.image_generation_tool import FAL_MODELS

        # z-image/turbo is a pure text-to-image model — no edit endpoint.
        text_only_entry = FAL_MODELS["fal-ai/z-image/turbo"]
        assert "edit_endpoint" not in text_only_entry

        # while nano-banana-pro is edit-capable
        edit_capable_entry = FAL_MODELS["fal-ai/nano-banana-pro"]
        assert edit_capable_entry.get("edit_endpoint")
class TestFalRouting:
    """Routing of ``image_generate_tool`` through the in-tree FAL path.

    The FAL submit hook is replaced by a recorder, so each test can inspect
    the endpoint and arguments that would have been submitted.
    """

    def _patch_submit(self, monkeypatch, image_tool, capture: dict):
        """Stub the FAL submit, key-check and gateway hooks on *image_tool*.

        Each submitted request is written into *capture* under the
        ``"endpoint"`` and ``"arguments"`` keys.
        """

        class _FakeHandle:
            def get(self):
                image = {"url": "https://out/img.png", "width": 1, "height": 1}
                return {"images": [image]}

        def fake_submit(endpoint, arguments):
            capture.update({"endpoint": endpoint, "arguments": arguments})
            return _FakeHandle()

        replacements = (
            ("_submit_fal_request", fake_submit),
            ("fal_key_is_configured", lambda: True),
            ("_resolve_managed_fal_gateway", lambda: None),
        )
        for attr_name, stub in replacements:
            monkeypatch.setattr(image_tool, attr_name, stub)

    def _prepare(self, cfg_home, monkeypatch, model):
        """Write a config selecting *model*, install the stubs, return ``(tool, capture)``."""
        import tools.image_generation_tool as image_tool

        _write_cfg(cfg_home, {"image_gen": {"model": model}})
        capture: dict = {}
        self._patch_submit(monkeypatch, image_tool, capture)
        return image_tool, capture

    def test_text_to_image_uses_base_endpoint(self, cfg_home, monkeypatch):
        image_tool, capture = self._prepare(
            cfg_home, monkeypatch, "fal-ai/nano-banana-pro"
        )

        result = json.loads(
            image_tool.image_generate_tool(prompt="a cat", aspect_ratio="square")
        )

        assert result["success"] is True
        assert result["modality"] == "text"
        assert capture["endpoint"] == "fal-ai/nano-banana-pro"
        assert "image_urls" not in capture["arguments"]

    def test_image_to_image_routes_to_edit_endpoint(self, cfg_home, monkeypatch):
        image_tool, capture = self._prepare(
            cfg_home, monkeypatch, "fal-ai/nano-banana-pro"
        )

        result = json.loads(
            image_tool.image_generate_tool(
                prompt="make it night",
                aspect_ratio="square",
                image_url="https://in/src.png",
            )
        )

        assert result["success"] is True
        assert result["modality"] == "image"
        assert capture["endpoint"] == "fal-ai/nano-banana-pro/edit"
        assert capture["arguments"]["image_urls"] == ["https://in/src.png"]

    def test_reference_images_clamped_to_model_cap(self, cfg_home, monkeypatch):
        # nano-banana-pro caps at 2 reference images.
        image_tool, capture = self._prepare(
            cfg_home, monkeypatch, "fal-ai/nano-banana-pro"
        )

        result = json.loads(
            image_tool.image_generate_tool(
                prompt="blend",
                image_url="https://in/a.png",
                reference_image_urls=[
                    "https://in/b.png",
                    "https://in/c.png",
                    "https://in/d.png",
                ],
            )
        )

        assert result["success"] is True
        submitted_urls = capture["arguments"]["image_urls"]
        assert submitted_urls == ["https://in/a.png", "https://in/b.png"]

    def test_text_only_model_rejects_image_url(self, cfg_home, monkeypatch):
        image_tool, capture = self._prepare(
            cfg_home, monkeypatch, "fal-ai/z-image/turbo"
        )

        result = json.loads(
            image_tool.image_generate_tool(
                prompt="edit this",
                image_url="https://in/src.png",
            )
        )

        assert result["success"] is False
        assert "image-to-image" in result["error"]
        # Must NOT have submitted anything.
        assert capture == {}

    def test_edit_skips_upscaler(self, cfg_home, monkeypatch):
        # flux-2-pro has upscale=True for text-to-image, but edits must skip it.
        image_tool, _capture = self._prepare(
            cfg_home, monkeypatch, "fal-ai/flux-2-pro"
        )
        upscale_called = {"hit": False}

        def _record_upscale(*args, **kwargs):
            # Flag the call and return None, like the stub it replaces.
            upscale_called["hit"] = True
            return None

        monkeypatch.setattr(image_tool, "_upscale_image", _record_upscale)

        result = json.loads(
            image_tool.image_generate_tool(
                prompt="tweak",
                image_url="https://in/src.png",
            )
        )

        assert result["success"] is True
        assert result["modality"] == "image"
        assert upscale_called["hit"] is False
# ---------------------------------------------------------------------------
# Plugin dispatch forwarding
# ---------------------------------------------------------------------------
class _EditCapableProvider(ImageGenProvider):
    """Stub provider that declares text + image modalities and records its inputs."""

    def __init__(self):
        # Arguments seen by the most recent generate() call; empty until then.
        self.received: Dict[str, Any] = {}

    @property
    def name(self) -> str:
        return "editcap"

    def capabilities(self) -> Dict[str, Any]:
        declared: Dict[str, Any] = {
            "modalities": ["text", "image"],
            "max_reference_images": 4,
        }
        return declared

    def generate(self, prompt, aspect_ratio="landscape", *, image_url=None,
                 reference_image_urls=None, **kwargs):
        """Record the call and return a canned success response."""
        seen: Dict[str, Any] = {}
        seen["prompt"] = prompt
        seen["aspect_ratio"] = aspect_ratio
        seen["image_url"] = image_url
        seen["reference_image_urls"] = reference_image_urls
        self.received = seen

        # A truthy source image marks the call as an edit.
        if image_url:
            modality = "image"
        else:
            modality = "text"

        response = {
            "success": True,
            "image": "/tmp/out.png",
            "model": "editcap-1",
            "prompt": prompt,
            "aspect_ratio": aspect_ratio,
            "modality": modality,
            "provider": "editcap",
        }
        return response
class _LegacyProvider(ImageGenProvider):
    """Stub provider with the old two-argument ``generate()`` signature.

    It takes neither ``image_url`` nor ``**kwargs``, so extra keyword
    arguments cannot be absorbed.
    """

    @property
    def name(self) -> str:
        return "legacy"

    def generate(self, prompt, aspect_ratio="landscape"):  # narrow signature
        canned = {
            "success": True,
            "image": "/tmp/legacy.png",
            "provider": "legacy",
        }
        return canned
class TestPluginDispatchImageToImage:
    """``_dispatch_to_plugin_provider`` with and without image inputs."""

    def _activate(self, monkeypatch, provider, provider_name):
        """Register *provider* and make it the configured, resolvable provider.

        Plugin discovery is stubbed to a no-op. Returns the image tool module
        so the caller can invoke the dispatcher on it.
        """
        import tools.image_generation_tool as image_tool
        from hermes_cli import plugins as plugins_module
        from agent import image_gen_registry as reg

        def _lookup(n):
            if n == provider_name:
                return provider
            return None

        reg.register_provider(provider)
        monkeypatch.setattr(
            image_tool, "_read_configured_image_provider", lambda: provider_name
        )
        monkeypatch.setattr(
            plugins_module, "_ensure_plugins_discovered", lambda *a, **k: None
        )
        monkeypatch.setattr(reg, "get_provider", _lookup)
        return image_tool

    def test_dispatch_forwards_image_url(self, cfg_home, monkeypatch):
        provider = _EditCapableProvider()
        image_tool = self._activate(monkeypatch, provider, "editcap")

        result = json.loads(
            image_tool._dispatch_to_plugin_provider(
                "make night",
                "square",
                image_url="https://in/src.png",
                reference_image_urls=["https://in/ref.png"],
            )
        )

        assert result["success"] is True
        assert result["modality"] == "image"
        assert provider.received["image_url"] == "https://in/src.png"
        assert provider.received["reference_image_urls"] == ["https://in/ref.png"]

    def test_dispatch_text_only_when_no_image(self, cfg_home, monkeypatch):
        provider = _EditCapableProvider()
        image_tool = self._activate(monkeypatch, provider, "editcap")

        result = json.loads(
            image_tool._dispatch_to_plugin_provider("a dog", "landscape")
        )

        assert result["success"] is True
        assert provider.received["image_url"] is None
        # Passes when the key is missing or is present with the value None.
        assert provider.received.get("reference_image_urls") is None

    def test_legacy_provider_edit_request_surfaces_clear_error(self, cfg_home, monkeypatch):
        provider = _LegacyProvider()
        image_tool = self._activate(monkeypatch, provider, "legacy")

        result = json.loads(
            image_tool._dispatch_to_plugin_provider(
                "edit it",
                "square",
                image_url="https://in/src.png",
            )
        )

        assert result["success"] is False
        assert result["error_type"] == "modality_unsupported"
# ---------------------------------------------------------------------------
# Dynamic schema reflects active capabilities
# ---------------------------------------------------------------------------
class _PluginBothProvider(ImageGenProvider):
    """Always-available stub provider declaring text + image modalities and 5 references."""

    @property
    def name(self) -> str:
        return "both"

    def is_available(self) -> bool:
        return True

    def default_model(self) -> Optional[str]:
        model_id = "both-v1"
        return model_id

    def capabilities(self) -> Dict[str, Any]:
        declared: Dict[str, Any] = {
            "modalities": ["text", "image"],
            "max_reference_images": 5,
        }
        return declared

    def generate(self, prompt, aspect_ratio="landscape", *, image_url=None,
                 reference_image_urls=None, **kwargs):
        # The schema tests never inspect the result; a bare success is enough.
        outcome = {"success": True}
        return outcome
class TestDynamicSchema:
    """Assertions on the description text from ``_build_dynamic_image_schema``."""

    def _no_discovery(self, monkeypatch):
        """Replace plugin discovery with a no-op for the duration of the test."""
        import hermes_cli.plugins as plugins_module

        def _skip(*a, **k):
            return None

        monkeypatch.setattr(plugins_module, "_ensure_plugins_discovered", _skip)

    def test_fal_edit_model_advertises_both(self, cfg_home, monkeypatch):
        from tools.image_generation_tool import _build_dynamic_image_schema

        _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/nano-banana-pro"}})
        schema = _build_dynamic_image_schema()
        desc = schema["description"]

        assert "text-to-image" in desc
        assert "image-to-image" in desc
        assert "routes automatically" in desc

    def test_fal_text_only_model_warns(self, cfg_home, monkeypatch):
        from tools.image_generation_tool import _build_dynamic_image_schema

        _write_cfg(cfg_home, {"image_gen": {"model": "fal-ai/z-image/turbo"}})
        schema = _build_dynamic_image_schema()
        desc = schema["description"]

        assert "text-to-image only" in desc
        assert "NOT capable of image-to-image" in desc

    def test_plugin_both_provider_advertises_refs(self, cfg_home, monkeypatch):
        from tools.image_generation_tool import _build_dynamic_image_schema
        from agent import image_gen_registry as reg

        _write_cfg(cfg_home, {"image_gen": {"provider": "both"}})
        reg.register_provider(_PluginBothProvider())
        self._no_discovery(monkeypatch)

        schema = _build_dynamic_image_schema()
        desc = schema["description"]

        assert "image-to-image / editing" in desc
        assert "up to 5 reference image(s)" in desc

    def test_builder_wired_into_registry(self):
        from tools.registry import discover_builtin_tools, registry

        discover_builtin_tools()
        tool_entry = registry._tools["image_generate"]
        assert tool_entry.dynamic_schema_overrides is not None

        overrides = tool_entry.dynamic_schema_overrides()
        assert "description" in overrides