""" Image Generation Provider ABC ============================= Defines the pluggable-backend interface for image generation. Providers register instances via ``PluginContext.register_image_gen_provider()``; the active one (selected via ``image_gen.provider`` in ``config.yaml``) services every ``image_generate`` tool call. Providers live in ``/plugins/image_gen//`` (built-in, auto-loaded as ``kind: backend``) or ``~/.hermes/plugins/image_gen//`` (user, opt-in via ``plugins.enabled``). Unified surface --------------- One tool — ``image_generate`` — covers **text-to-image** and **image-to-image / image editing**. The router is the presence of ``image_url`` (and/or ``reference_image_urls``): if any source image is provided, the provider routes to its image-to-image / edit endpoint; if omitted, the provider routes to text-to-image. Users pick one **model** (e.g. nano-banana-pro, gpt-image-2, grok-imagine-image); the provider handles which underlying endpoint to hit. This mirrors the ``video_gen`` provider design (``agent/video_gen_provider.py``) so the two surfaces stay learnable together. Response shape -------------- All providers return a dict that :func:`success_response` / :func:`error_response` produce. The tool wrapper JSON-serializes it. Keys: success bool image str | None URL or absolute file path model str provider-specific model identifier prompt str echoed prompt aspect_ratio str "landscape" | "square" | "portrait" modality str "text" | "image" (which mode was used) provider str provider name (for diagnostics) error str only when success=False error_type str only when success=False """ from __future__ import annotations import abc import base64 import datetime import logging import uuid from pathlib import Path from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) VALID_ASPECT_RATIOS: Tuple[str, ...] = ("landscape", "square", "portrait") DEFAULT_ASPECT_RATIO = "landscape" # --------------------------------------------------------------------------- # ABC # --------------------------------------------------------------------------- class ImageGenProvider(abc.ABC): """Abstract base class for an image generation backend. Subclasses must implement :meth:`generate`. Everything else has sane defaults — override only what your provider needs. """ @property @abc.abstractmethod def name(self) -> str: """Stable short identifier used in ``image_gen.provider`` config. Lowercase, no spaces. Examples: ``fal``, ``openai``, ``replicate``. """ @property def display_name(self) -> str: """Human-readable label shown in ``hermes tools``. Defaults to ``name.title()``.""" return self.name.title() def is_available(self) -> bool: """Return True when this provider can service calls. Typically checks for a required API key. Default: True (providers with no external dependencies are always available). """ return True def list_models(self) -> List[Dict[str, Any]]: """Return catalog entries for ``hermes tools`` model picker. Each entry:: { "id": "gpt-image-1.5", # required "display": "GPT Image 1.5", # optional; defaults to id "speed": "~10s", # optional "strengths": "...", # optional "price": "$...", # optional } Default: empty list (provider has no user-selectable models). """ return [] def get_setup_schema(self) -> Dict[str, Any]: """Return provider metadata for the ``hermes tools`` picker. Used by ``tools_config.py`` to inject this provider as a row in the Image Generation provider list. Shape:: { "name": "OpenAI", # picker label "badge": "paid", # optional short tag "tag": "One-line description...", # optional subtitle "env_vars": [ # keys to prompt for {"key": "OPENAI_API_KEY", "prompt": "OpenAI API key", "url": "https://platform.openai.com/api-keys"}, ], } Default: minimal entry derived from ``display_name``. Override to expose API key prompts and custom badges. """ return { "name": self.display_name, "badge": "", "tag": "", "env_vars": [], } def default_model(self) -> Optional[str]: """Return the default model id, or None if not applicable.""" models = self.list_models() if models: return models[0].get("id") return None def capabilities(self) -> Dict[str, Any]: """Return what this provider supports. Returned dict (all keys optional):: { "modalities": ["text", "image"], # which inputs the backend accepts "max_reference_images": 9, # cap for reference_image_urls } ``modalities`` declares whether the active backend/model supports text-to-image (``"text"``), image-to-image / editing (``"image"``), or both. The tool layer surfaces this in the dynamic schema so the model knows when ``image_url`` is honored. Used by ``hermes tools`` for the picker too. Default: text-only (backward compatible — a provider that doesn't override this advertises text-to-image only). """ return { "modalities": ["text"], "max_reference_images": 0, } @abc.abstractmethod def generate( self, prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, *, image_url: Optional[str] = None, reference_image_urls: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, Any]: """Generate an image from a text prompt, or edit/transform a source image. Routing: if ``image_url`` (or any ``reference_image_urls``) is provided, the provider should route to its image-to-image / edit endpoint; otherwise text-to-image. ``image_url`` is the primary source image to edit; ``reference_image_urls`` are additional style/composition references (provider clamps to its declared ``max_reference_images``). Implementations should return the dict from :func:`success_response` or :func:`error_response`. ``kwargs`` may contain forward-compat parameters future versions of the schema will expose — implementations MUST ignore unknown keys (no TypeError). """ # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def resolve_aspect_ratio(value: Optional[str]) -> str: """Clamp an aspect_ratio value to the valid set, defaulting to landscape. Invalid values are coerced rather than rejected so the tool surface is forgiving of agent mistakes. """ if not isinstance(value, str): return DEFAULT_ASPECT_RATIO v = value.strip().lower() if v in VALID_ASPECT_RATIOS: return v return DEFAULT_ASPECT_RATIO def normalize_reference_images(value: Any) -> Optional[List[str]]: """Coerce a reference-image argument into a clean list of URL/path strings. Accepts a single string or a list; strips blanks and whitespace. Returns ``None`` when nothing usable remains so providers can treat "no refs" as a single sentinel. """ if value is None: return None if isinstance(value, str): value = [value] if not isinstance(value, (list, tuple)): return None out: List[str] = [] for item in value: if isinstance(item, str) and item.strip(): out.append(item.strip()) return out or None def _images_cache_dir() -> Path: """Return ``$HERMES_HOME/cache/images/``, creating parents as needed.""" from hermes_constants import get_hermes_home path = get_hermes_home() / "cache" / "images" path.mkdir(parents=True, exist_ok=True) return path def save_b64_image( b64_data: str, *, prefix: str = "image", extension: str = "png", ) -> Path: """Decode base64 image data and write it under ``$HERMES_HOME/cache/images/``. Returns the absolute :class:`Path` to the saved file. Filename format: ``__.``. """ raw = base64.b64decode(b64_data) ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") short = uuid.uuid4().hex[:8] path = _images_cache_dir() / f"{prefix}_{ts}_{short}.{extension}" path.write_bytes(raw) return path # Extension inference for save_url_image — keep small and explicit. We don't # want to import mimetypes for a handful of formats every image_gen provider # actually returns, and we never want to inherit a content-type that points # at HTML or JSON when the API gives us a degenerate response. _URL_IMAGE_CONTENT_TYPES = { "image/png": "png", "image/jpeg": "jpg", "image/jpg": "jpg", "image/webp": "webp", "image/gif": "gif", } def save_url_image( url: str, *, prefix: str = "image", timeout: float = 60.0, max_bytes: int = 25 * 1024 * 1024, ) -> Path: """Download an image URL and write it under ``$HERMES_HOME/cache/images/``. Used by providers (xAI, fallback OpenAI) whose API returns an *ephemeral* URL instead of inline base64 — those URLs frequently expire before a downstream consumer (Telegram ``send_photo``, browser fetch) can resolve them, so we materialise the bytes locally at tool-completion time. Mirrors :func:`save_b64_image`'s shape so providers can swap in one line. Returns the absolute :class:`Path` to the saved file. Raises on any network / HTTP / oversize / non-image-content-type error so callers can fall back to returning the bare URL with a clear error message. """ import requests response = requests.get(url, timeout=timeout, stream=True) response.raise_for_status() # Infer extension from the response content-type, falling back to the # URL suffix when xAI / OpenAI omit a precise type (some CDNs return # ``application/octet-stream``). Defaults to ``png``. content_type = (response.headers.get("Content-Type") or "").split(";", 1)[0].strip().lower() extension = _URL_IMAGE_CONTENT_TYPES.get(content_type) if extension is None: url_path = url.split("?", 1)[0].lower() for ext in ("png", "jpg", "jpeg", "webp", "gif"): if url_path.endswith(f".{ext}"): extension = "jpg" if ext == "jpeg" else ext break if extension is None: extension = "png" ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") short = uuid.uuid4().hex[:8] path = _images_cache_dir() / f"{prefix}_{ts}_{short}.{extension}" bytes_written = 0 with path.open("wb") as fh: for chunk in response.iter_content(chunk_size=64 * 1024): if not chunk: continue bytes_written += len(chunk) if bytes_written > max_bytes: fh.close() try: path.unlink() except OSError: pass raise ValueError( f"Image at {url} exceeds {max_bytes // (1024 * 1024)}MB cap; refusing to cache." ) fh.write(chunk) if bytes_written == 0: try: path.unlink() except OSError: pass raise ValueError(f"Image at {url} returned 0 bytes; refusing to cache.") return path def success_response( *, image: str, model: str, prompt: str, aspect_ratio: str, provider: str, modality: str = "text", extra: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Build a uniform success response dict. ``image`` may be an HTTP URL or an absolute filesystem path (for b64 providers like OpenAI). ``modality`` is ``"text"`` (text-to-image) or ``"image"`` (image-to-image / editing) — indicates which endpoint was actually hit, useful for diagnostics. Callers that need to pass through additional backend-specific fields can supply ``extra``. """ payload: Dict[str, Any] = { "success": True, "image": image, "model": model, "prompt": prompt, "aspect_ratio": aspect_ratio, "modality": modality, "provider": provider, } if extra: for k, v in extra.items(): payload.setdefault(k, v) return payload def error_response( *, error: str, error_type: str = "provider_error", provider: str = "", model: str = "", prompt: str = "", aspect_ratio: str = DEFAULT_ASPECT_RATIO, ) -> Dict[str, Any]: """Build a uniform error response dict.""" return { "success": False, "image": None, "error": error, "error_type": error_type, "model": model, "prompt": prompt, "aspect_ratio": aspect_ratio, "provider": provider, }