mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-20 10:11:58 +00:00
* feat(image-gen): add image-to-image / editing to image_generate Brings image generation to parity with video generation: the unified image_generate tool now edits/transforms a source image (image-to-image) when given image_url / reference_image_urls, routing to each backend's edit endpoint, exactly as video_generate routes to image-to-video. - ImageGenProvider ABC: generate() gains keyword-only image_url + reference_image_urls; new capabilities() declares modalities + max_reference_images (defaults to text-only, backward compatible). success_response gains a modality field; adds normalize_reference_images. - image_generate tool: schema exposes image_url + reference_image_urls; dynamic schema reflects the active model's actual edit capability so the agent knows when image_url is honored. Handler + plugin dispatch forward the new inputs; legacy/text-only providers get a clear modality_unsupported error instead of silently dropping the source image. - In-tree FAL: 7 models gain edit endpoints (flux-2-klein, flux-2-pro, nano-banana-pro, gpt-image-1.5, gpt-image-2, ideogram/v3, qwen-image) with per-model edit_supports whitelists + reference caps; routes to the /edit endpoint and skips the upscaler for edits. - Plugins: openai (images.edit, 16 refs), xai (/v1/images/edits via grok-imagine-image-quality, JSON body per xAI docs), krea (image_style_references, 10 refs). openai-codex stays text-only and rejects edits with an actionable error. - Tests: 15 new (payload, routing, dispatch forwarding, dynamic schema, capabilities); updated 2 change-detector/lambda tests for the new schema. - Docs: image-generation feature page, image-gen provider plugin guide, tools reference. * fix(image-gen): preserve legacy passthrough in fal/krea plugin tests Two existing plugin tests asserted pre-image-to-image behavior: - fal: forward image_url/reference_image_urls only when supplied, so a text-to-image delegation stays byte-identical (no None kwargs). - krea: keep dict-shaped image_style_references refs verbatim (the unified string refs go through normalize_reference_images; legacy non-string ref objects pass through unchanged) — fixes KeyError when callers pass the richer Krea ref-object shape. * fix(image-gen): clearer not-capable message for text-to-image-only models When a text-to-image-only model (incl. gpt-image-2 on the Codex OAuth path, which can't do editing through the Responses image_generation tool) gets a source image, say 'this model is not capable of image-to-image / editing — provide a text-only prompt' rather than sending the user shopping for other backends. Applies to the openai-codex guard, the in-tree FAL no-edit-endpoint error, and the dynamic tool-schema text-only line.
211 lines
7.7 KiB
Python
211 lines
7.7 KiB
Python
"""FAL.ai image generation backend.
|
|
|
|
Wraps the 18-model FAL catalog (FLUX 2, Z-Image, Nano Banana, GPT
|
|
Image 1.5, Recraft, Imagen 4, Qwen, Ideogram, …) as an
|
|
:class:`ImageGenProvider` implementation.
|
|
|
|
The heavy lifting — model catalog, payload construction, request
|
|
submission, managed-Nous-gateway selection, Clarity Upscaler chaining
|
|
— lives in :mod:`tools.image_generation_tool`. This plugin reaches into
|
|
that module via call-time indirection (``import tools.image_generation_tool as _it``)
|
|
so:
|
|
|
|
* the existing test suite (``tests/tools/test_image_generation.py``,
|
|
``tests/tools/test_managed_media_gateways.py``) keeps patching
|
|
``image_tool._submit_fal_request`` / ``image_tool.fal_client`` /
|
|
``image_tool._managed_fal_client`` without modification, and
|
|
* there's exactly one canonical FAL code path on disk — the plugin is a
|
|
registration adapter, not a parallel implementation.
|
|
|
|
See issue #26241 for the migration plan and the
|
|
``plugin-extraction-test-patch-compatibility.md`` rules this follows.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from agent.image_gen_provider import (
|
|
DEFAULT_ASPECT_RATIO,
|
|
ImageGenProvider,
|
|
resolve_aspect_ratio,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FalImageGenProvider(ImageGenProvider):
|
|
"""FAL.ai image generation backend.
|
|
|
|
Delegates to ``tools.image_generation_tool.image_generate_tool`` so
|
|
the in-tree FAL implementation (model catalog, payload builder,
|
|
managed-gateway selection, Clarity Upscaler chaining) is the single
|
|
source of truth. Everything is resolved at call time via the
|
|
``_it`` indirection so tests can monkey-patch the legacy module.
|
|
"""
|
|
|
|
@property
|
|
def name(self) -> str:
|
|
return "fal"
|
|
|
|
@property
|
|
def display_name(self) -> str:
|
|
return "FAL.ai"
|
|
|
|
def is_available(self) -> bool:
|
|
# Available when direct FAL_KEY is set OR the managed Nous
|
|
# gateway resolves a fal-queue origin. Both checks come from the
|
|
# legacy module so this provider tracks whatever logic ships
|
|
# there.
|
|
import tools.image_generation_tool as _it
|
|
try:
|
|
return bool(_it.check_fal_api_key())
|
|
except Exception: # noqa: BLE001 — defensive; never break the picker
|
|
return False
|
|
|
|
def list_models(self) -> List[Dict[str, Any]]:
|
|
import tools.image_generation_tool as _it
|
|
return [
|
|
{
|
|
"id": model_id,
|
|
"display": meta.get("display", model_id),
|
|
"speed": meta.get("speed", ""),
|
|
"strengths": meta.get("strengths", ""),
|
|
"price": meta.get("price", ""),
|
|
}
|
|
for model_id, meta in _it.FAL_MODELS.items()
|
|
]
|
|
|
|
def default_model(self) -> Optional[str]:
|
|
import tools.image_generation_tool as _it
|
|
return _it.DEFAULT_MODEL
|
|
|
|
def get_setup_schema(self) -> Dict[str, Any]:
|
|
return {
|
|
"name": "FAL.ai",
|
|
"badge": "paid",
|
|
"tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc. — text-to-image & image editing",
|
|
"env_vars": [
|
|
{
|
|
"key": "FAL_KEY",
|
|
"prompt": "FAL API key",
|
|
"url": "https://fal.ai/dashboard/keys",
|
|
},
|
|
],
|
|
}
|
|
|
|
def capabilities(self) -> Dict[str, Any]:
|
|
# Whether image-to-image is available depends on the currently-
|
|
# selected FAL model (each model entry declares an edit_endpoint or
|
|
# not). Report the active model's actual surface so the dynamic tool
|
|
# schema is accurate.
|
|
import tools.image_generation_tool as _it
|
|
|
|
try:
|
|
_model_id, meta = _it._resolve_fal_model()
|
|
except Exception: # noqa: BLE001
|
|
return {"modalities": ["text"], "max_reference_images": 0}
|
|
if meta.get("edit_endpoint"):
|
|
return {
|
|
"modalities": ["text", "image"],
|
|
"max_reference_images": int(meta.get("max_reference_images") or 1),
|
|
}
|
|
return {"modalities": ["text"], "max_reference_images": 0}
|
|
|
|
def generate(
|
|
self,
|
|
prompt: str,
|
|
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
|
|
*,
|
|
image_url: Optional[str] = None,
|
|
reference_image_urls: Optional[List[str]] = None,
|
|
**kwargs: Any,
|
|
) -> Dict[str, Any]:
|
|
"""Generate or edit an image via the legacy FAL pipeline.
|
|
|
|
Forwards prompt + aspect_ratio + image_url/reference_image_urls (and
|
|
any forward-compat extras the schema supports) into
|
|
:func:`tools.image_generation_tool.image_generate_tool`, then reshapes
|
|
its JSON-string response into the provider-ABC dict format consumed by
|
|
``_dispatch_to_plugin_provider``.
|
|
"""
|
|
import tools.image_generation_tool as _it
|
|
|
|
aspect = resolve_aspect_ratio(aspect_ratio)
|
|
passthrough = {
|
|
key: kwargs[key]
|
|
for key in (
|
|
"num_inference_steps",
|
|
"guidance_scale",
|
|
"num_images",
|
|
"output_format",
|
|
"seed",
|
|
)
|
|
if key in kwargs and kwargs[key] is not None
|
|
}
|
|
# Only forward the image-to-image inputs when actually supplied, so a
|
|
# plain text-to-image call delegates exactly as it did before (no
|
|
# noisy None kwargs).
|
|
if image_url is not None:
|
|
passthrough["image_url"] = image_url
|
|
if reference_image_urls is not None:
|
|
passthrough["reference_image_urls"] = reference_image_urls
|
|
|
|
try:
|
|
raw = _it.image_generate_tool(
|
|
prompt=prompt,
|
|
aspect_ratio=aspect,
|
|
**passthrough,
|
|
)
|
|
except Exception as exc: # noqa: BLE001 — never raise out of generate
|
|
logger.warning("FAL image_generate_tool raised: %s", exc, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"image": None,
|
|
"error": f"FAL image generation failed: {exc}",
|
|
"error_type": type(exc).__name__,
|
|
"provider": "fal",
|
|
"prompt": prompt,
|
|
"aspect_ratio": aspect,
|
|
}
|
|
|
|
try:
|
|
response = json.loads(raw) if isinstance(raw, str) else raw
|
|
except Exception: # noqa: BLE001
|
|
response = {"success": False, "image": None, "error": "Invalid JSON from FAL pipeline"}
|
|
|
|
if not isinstance(response, dict):
|
|
response = {
|
|
"success": False,
|
|
"image": None,
|
|
"error": "FAL pipeline returned a non-dict response",
|
|
"error_type": "provider_contract",
|
|
}
|
|
|
|
# Stamp provider/prompt/aspect_ratio so downstream consumers see
|
|
# the uniform shape declared in ``agent.image_gen_provider``.
|
|
response.setdefault("provider", "fal")
|
|
response.setdefault("prompt", prompt)
|
|
response.setdefault("aspect_ratio", aspect)
|
|
# Annotate model best-effort — the legacy pipeline resolves it
|
|
# internally, so query it after the fact for the response shape.
|
|
if "model" not in response:
|
|
try:
|
|
model_id, _meta = _it._resolve_fal_model()
|
|
response["model"] = model_id
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
return response
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Plugin entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def register(ctx) -> None:
|
|
"""Plugin entry point — wire ``FalImageGenProvider`` into the registry."""
|
|
ctx.register_image_gen_provider(FalImageGenProvider())
|