mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
The quality-first default (OpenAI image via OpenRouter) is slow, and a full hatch fans out ~8 rows with up to 3 retries each (300s/call) across 2 parallel waves, so the absolute backend worst case is ~30 min. The old ceilings fired mid-run:
- per-image HTTP call: 180s -> 300s (a single cold row can exceed 3 min)
- drafts RPC: 240s -> 420s (single wave, no retries — 7 min is ample)
- hatch RPC: 420s -> 1hr (sits above the ~30 min backend worst case)
The hatch ceiling is intentionally well above the realistic max so the frontend never throws "request timed out" before the backend has exhausted its own retries. The background-resumable notification path remains the real UX safety net — the user can close the modal and get pinged on completion.
511 lines
19 KiB
Python
511 lines
19 KiB
Python
"""OpenRouter-compatible image generation backend (OpenRouter + Nous Portal).
|
|
|
|
Both OpenRouter and the Nous Portal inference endpoint speak the same
|
|
OpenAI-style ``/chat/completions`` image-generation protocol: send
|
|
``modalities: ["image", "text"]`` with an image-output model (e.g.
|
|
``google/gemini-3-pro-image``), pass reference images as ``image_url``
|
|
content parts for grounding, and read the generated images back from
|
|
``choices[0].message.images[].image_url.url`` (a ``data:image/...;base64`` URI).
|
|
|
|
Nous Portal proxies OpenRouter, so one implementation services both — we only
|
|
swap the resolved ``(base_url, api_key)``. Credentials are resolved through the
|
|
agent's existing :func:`~hermes_cli.runtime_provider.resolve_runtime_provider`,
|
|
which already understands OpenRouter's key pool and the Nous OAuth device-code
|
|
token, so this plugin never reinvents auth.
|
|
|
|
Reference grounding is the reason pet sprite generation cares about this
|
|
backend: each animation row must stay the same character as the chosen base
|
|
frame, which only works on models that accept image input. Gemini Flash Image
|
|
("nano-banana") does, so both providers advertise image-to-image support.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import logging
|
|
import mimetypes
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from agent.image_gen_provider import (
|
|
DEFAULT_ASPECT_RATIO,
|
|
ImageGenProvider,
|
|
error_response,
|
|
resolve_aspect_ratio,
|
|
save_b64_image,
|
|
save_url_image,
|
|
success_response,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)

# Model selection for OpenRouter-compatible endpoints is quality-first.
#
# With no env/config override, the highest-fidelity OpenAI image model is
# attempted first and Gemini 3 Pro Image is the fallback when the OpenAI model
# is access-gated / unavailable / times out on this endpoint.
#
# With an explicit override (OPENROUTER_IMAGE_MODEL or
# image_gen.<provider>.model), exactly that model is used and there is no
# automatic fallback, so power users keep full control.
DEFAULT_MODEL = "openai/gpt-5.4-image-2"
_FALLBACK_MODEL = "google/gemini-3-pro-image"
_DEFAULT_MODEL_CHAIN = (DEFAULT_MODEL, _FALLBACK_MODEL)

# Maps the semantic aspect ratio of the image_gen contract onto the
# aspect_ratio strings OpenRouter expects inside image_config.
_ASPECT_RATIOS = dict(
    square="1:1",
    landscape="16:9",
    portrait="9:16",
)

# Gemini Flash Image accepts up to 3 input images per prompt; references are
# clamped to this count so the model's limit is never overflowed.
_MAX_REFERENCE_IMAGES = 3

# Timeout (seconds) for one image call. The quality-first default (OpenAI image
# via OpenRouter) is genuinely slow — a single cold row can run well past
# 3 minutes — so each call gets real headroom before it is treated as hung and
# the caller falls back / retries.
_REQUEST_TIMEOUT = 300.0
|
|
|
|
|
|
def _load_image_gen_config() -> Dict[str, Any]:
    """Return the ``image_gen`` mapping from config.yaml, or ``{}``.

    Loading is best-effort: any exception raised while importing or calling
    ``load_config`` is logged at debug level and an empty dict is returned.
    An empty dict is also returned when the loaded config, or its
    ``image_gen`` entry, is not a dict.
    """
    try:
        from hermes_cli.config import load_config

        config = load_config()
        if not isinstance(config, dict):
            return {}
        image_gen = config.get("image_gen")
        if isinstance(image_gen, dict):
            return image_gen
        return {}
    except Exception as exc:  # noqa: BLE001 - config is best-effort
        logger.debug("could not load image_gen config: %s", exc)
        return {}
|
|
|
|
|
|
def _to_image_url_part(ref: str) -> Optional[str]:
    """Turn a reference (local path or URL) into an ``image_url`` value.

    ``http://``, ``https://`` and ``data:`` references pass through unchanged
    (after whitespace stripping). Anything else is treated as a local file
    path and inlined as a base64 data URI so the request is self-contained
    (the provider endpoint can't reach a path on our disk).

    Args:
        ref: Reference image location; falsy values are treated as empty.

    Returns:
        The URL / data URI to send, or ``None`` when the reference is empty
        or the file cannot be read.
    """
    ref = str(ref or "").strip()
    if not ref:
        return None
    if ref.startswith(("http://", "https://", "data:")):
        return ref
    path = Path(ref)
    try:
        raw = path.read_bytes()
    except (OSError, ValueError) as exc:
        # ValueError covers malformed paths (e.g. an embedded NUL byte), which
        # the OS layer rejects before any OSError can be raised.
        logger.debug("could not read reference image %s: %s", ref, exc)
        return None
    # Unknown extensions fall back to PNG.
    mime = mimetypes.guess_type(path.name)[0] or "image/png"
    encoded = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{encoded}"
|
|
|
|
|
|
def _extract_images(payload: Dict[str, Any]) -> List[str]:
    """Collect generated image URLs from a chat-completions response.

    Every ``choices[].message.images[].image_url.url`` that is a non-blank
    string is returned, whitespace-stripped, in response order (OpenRouter
    typically puts a base64 data URI there). Any node that does not have the
    expected dict/list shape is skipped rather than raising.
    """
    if not isinstance(payload, dict):
        return []
    choices = payload.get("choices")
    if not isinstance(choices, list):
        return []

    urls: List[str] = []
    for choice in choices:
        if not isinstance(choice, dict):
            continue
        message = choice.get("message")
        if not isinstance(message, dict):
            continue
        entries = message.get("images")
        if not isinstance(entries, list):
            continue
        for entry in entries:
            holder = entry.get("image_url") if isinstance(entry, dict) else None
            if not isinstance(holder, dict):
                continue
            candidate = holder.get("url")
            if not isinstance(candidate, str):
                continue
            cleaned = candidate.strip()
            if cleaned:
                urls.append(cleaned)
    return urls
|
|
|
|
|
|
def _access_error_hint(
    display: str, model_id: str, env_var: str, status: int, err_msg: str
) -> Optional[str]:
    """Build a targeted hint for an unreachable, access-gated OpenAI image model.

    Some OpenAI image models on OpenRouter need account enablement / BYOK, so
    the key is valid but the *model* is unreachable; a generic "check your
    key" message would mislead. The gated case is recognised only for model
    ids starting with ``openai/`` and when either the HTTP status is 402, 403
    or 404, or the lower-cased error text contains one of a few known phrases.

    Returns:
        One actionable line naming the model, the status and the env var to
        set to the fallback model, or ``None`` when this is not the gated case.
    """
    if not model_id.startswith("openai/"):
        return None

    message = (err_msg or "").lower()
    markers = ("no endpoints", "no allowed", "not a valid model", "data policy")
    status_gated = status in (402, 403, 404)
    if not status_gated and not any(marker in message for marker in markers):
        return None

    reach = f"{display} can't reach image model '{model_id}' ({status}) — enable OpenAI "
    remedy = f"image access in your {display} account, or set {env_var}={_FALLBACK_MODEL}."
    return reach + remedy
|
|
|
|
|
|
def _dedupe_models(models: list[str]) -> list[str]:
    """Return *models* stripped of whitespace, blanks and repeats.

    The first occurrence of each (stripped) model id wins and the original
    order is kept. Falsy entries are treated as empty and dropped.
    """
    # A dict keeps insertion order, so it doubles as an ordered set here.
    ordered: dict[str, None] = {}
    for raw in models:
        name = (raw or "").strip()
        if name:
            ordered.setdefault(name, None)
    return list(ordered)
|
|
|
|
|
|
class OpenRouterCompatImageProvider(ImageGenProvider):
    """Image generation over an OpenRouter-compatible chat-completions endpoint.

    Instantiated once per backend (OpenRouter, Nous Portal). The two differ only
    in which runtime provider supplies ``(base_url, api_key)`` and in the config
    namespace used for the model override.
    """

    def __init__(
        self,
        *,
        provider_name: str,
        display_name: str,
        runtime_name: str,
        config_key: str,
        model_env_var: str,
        setup_schema: Dict[str, Any],
    ) -> None:
        """Store the per-backend identity and configuration.

        Args:
            provider_name: Returned by :attr:`name`; also used in log lines,
                responses and as the saved-file prefix.
            display_name: Human-readable name used in error messages.
            runtime_name: Name handed to ``resolve_runtime_provider``.
            config_key: Key inside the ``image_gen`` config section whose
                ``model`` entry overrides the model.
            model_env_var: Environment variable that overrides the model.
            setup_schema: Mapping returned (shallow-copied) by
                :meth:`get_setup_schema`.
        """
        self._name = provider_name
        self._display = display_name
        self._runtime_name = runtime_name
        self._config_key = config_key
        self._model_env_var = model_env_var
        self._setup_schema = setup_schema

    @property
    def name(self) -> str:
        """Provider identifier given at construction."""
        return self._name

    @property
    def display_name(self) -> str:
        """Human-readable provider name given at construction."""
        return self._display

    def _resolve_runtime(self) -> Dict[str, Any]:
        """Resolve ``(base_url, api_key)`` via the shared runtime resolver."""
        from hermes_cli.runtime_provider import resolve_runtime_provider

        return resolve_runtime_provider(requested=self._runtime_name)

    def is_available(self) -> bool:
        """Return ``True`` when the runtime resolves to a non-blank API key.

        A failure to resolve the runtime is logged at debug level and treated
        as "unavailable".
        """
        try:
            runtime = self._resolve_runtime()
        except Exception as exc:  # noqa: BLE001 - treat resolution failure as unavailable
            logger.debug("%s runtime resolution failed: %s", self._name, exc)
            return False
        return bool(str(runtime.get("api_key") or "").strip())

    def capabilities(self) -> Dict[str, Any]:
        """Advertise text-to-image and image-to-image support."""
        # Both text-to-image and image-to-image (reference grounding) — the
        # latter is what makes this backend usable for pet sprite rows.
        return {
            "modalities": ["text", "image"],
            "max_reference_images": _MAX_REFERENCE_IMAGES,
        }

    def list_models(self) -> List[Dict[str, Any]]:
        """Describe the default model and its fallback."""
        return [
            {
                "id": DEFAULT_MODEL,
                "display": "OpenAI GPT-5.4 Image 2",
                "strengths": "Highest fidelity; best prompt adherence; slower on OpenRouter",
            },
            {
                "id": _FALLBACK_MODEL,
                "display": "Gemini 3 Pro Image",
                "strengths": "Fast, reliable fallback with good layout adherence",
            },
        ]

    def default_model(self) -> Optional[str]:
        """Return the model that would be tried first right now."""
        return self._resolve_model()

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return a shallow copy of the setup schema given at construction."""
        return dict(self._setup_schema)

    def _resolve_model(self) -> str:
        """Pick the image model: env override → config → :data:`DEFAULT_MODEL`."""
        return self._resolve_model_chain()[0]

    def _resolve_model_chain(self) -> list[str]:
        """Ordered model attempts for this request.

        Explicit user/model config means "use this exact model", so no fallback.
        Without overrides we run the quality-first default chain.
        """
        env_override = os.environ.get(self._model_env_var, "").strip()
        if env_override:
            return [env_override]
        scoped = _load_image_gen_config().get(self._config_key)
        if isinstance(scoped, dict):
            value = scoped.get("model")
            if isinstance(value, str) and value.strip():
                return [value.strip()]
        return _dedupe_models(list(_DEFAULT_MODEL_CHAIN))

    @staticmethod
    def _reference_parts(
        reference_images: Any,
        image_url: Optional[str],
        reference_image_urls: Optional[List[str]],
    ) -> List[Dict[str, Any]]:
        """Build the ``image_url`` content parts for the reference images.

        References are considered in the order ``reference_images``,
        ``image_url``, ``reference_image_urls``. Unreadable references are
        dropped *before* the ``_MAX_REFERENCE_IMAGES`` clamp is applied, so a
        bad entry does not use up one of the available slots.
        """
        candidates: List[str] = [str(ref) for ref in reference_images or []]
        if image_url:
            candidates.append(str(image_url))
        candidates.extend(str(ref) for ref in reference_image_urls or [])

        parts: List[Dict[str, Any]] = []
        for ref in candidates:
            if len(parts) >= _MAX_REFERENCE_IMAGES:
                break
            url = _to_image_url_part(ref)
            if url:
                parts.append({"type": "image_url", "image_url": {"url": url}})
        return parts

    @staticmethod
    def _http_error_message(exc: Any) -> str:
        """Extract a string error message from a ``requests.HTTPError``.

        Prefers a non-blank string at ``error.message`` in the JSON body, then
        the first 300 characters of the body text, then ``str(exc)`` when the
        exception carries no response. Always returns a ``str`` so downstream
        string handling cannot fail on an unexpected body shape.
        """
        resp = exc.response
        if resp is None:
            return str(exc)
        try:
            body = resp.json()
        except Exception:  # noqa: BLE001 - non-JSON error bodies are expected
            body = None
        error = body.get("error") if isinstance(body, dict) else None
        message = error.get("message") if isinstance(error, dict) else None
        if isinstance(message, str) and message.strip():
            return message
        return resp.text[:300]

    def generate(
        self,
        prompt: str,
        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
        *,
        image_url: Optional[str] = None,
        reference_image_urls: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate one image and save it locally.

        Each model of the resolved chain is tried in order. The next model is
        attempted only when the current one is access-gated (see
        ``_access_error_hint``), times out, or answers without an image; any
        other failure is returned immediately.

        Args:
            prompt: Text prompt sent as the first content part.
            aspect_ratio: Semantic aspect ratio, normalised through
                ``resolve_aspect_ratio``; unknown values map to ``"1:1"``.
            image_url: Optional single reference image (URL or local path).
            reference_image_urls: Optional further reference images.
            **kwargs: Only ``reference_images`` (an iterable of references)
                is read; other keys are ignored.

        Returns:
            The dict built by ``success_response`` on success, otherwise the
            dict built by ``error_response``. Transport errors from
            ``requests`` are reported this way rather than raised.
        """
        import requests

        try:
            runtime = self._resolve_runtime()
        except Exception as exc:  # noqa: BLE001
            return error_response(
                error=f"Could not resolve {self._display} credentials: {exc}",
                error_type="missing_api_key",
                provider=self._name,
                aspect_ratio=aspect_ratio,
            )
        api_key = str(runtime.get("api_key") or "").strip()
        base_url = str(runtime.get("base_url") or "").strip().rstrip("/")
        if not api_key or not base_url:
            return error_response(
                error=(
                    f"No {self._display} credentials found. "
                    f"Configure {self._display} in `hermes tools` → Image Generation."
                ),
                error_type="missing_api_key",
                provider=self._name,
                aspect_ratio=aspect_ratio,
            )

        model_chain = self._resolve_model_chain()
        aspect = resolve_aspect_ratio(aspect_ratio)
        or_aspect = _ASPECT_RATIOS.get(aspect, "1:1")

        def fail(message: str, error_type: str, model: str) -> Dict[str, Any]:
            # Every post-credential failure carries the same context fields.
            return error_response(
                error=message,
                error_type=error_type,
                provider=self._name,
                model=model,
                prompt=prompt,
                aspect_ratio=aspect,
            )

        # Collect every reference: the pet generator passes local paths via the
        # ``reference_images`` kwarg; the generic tool surface uses ``image_url``
        # / ``reference_image_urls``. Accept all three.
        content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
        content.extend(
            self._reference_parts(
                kwargs.get("reference_images"), image_url, reference_image_urls
            )
        )

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            # OpenRouter attribution headers (harmless against Nous Portal).
            "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
            "X-Title": "Hermes Agent",
        }

        for index, model_id in enumerate(model_chain):
            has_next = index + 1 < len(model_chain)
            next_model = model_chain[index + 1] if has_next else None
            payload: Dict[str, Any] = {
                "model": model_id,
                "modalities": ["image", "text"],
                "messages": [{"role": "user", "content": content}],
                "image_config": {"aspect_ratio": or_aspect},
            }
            try:
                response = requests.post(
                    f"{base_url}/chat/completions",
                    headers=headers,
                    json=payload,
                    timeout=_REQUEST_TIMEOUT,
                )
                response.raise_for_status()
            except requests.HTTPError as exc:
                status = exc.response.status_code if exc.response is not None else 0
                err_msg = self._http_error_message(exc)
                logger.error(
                    "%s image gen failed (%d) on %s: %s",
                    self._name,
                    status,
                    model_id,
                    err_msg,
                )
                hint = _access_error_hint(
                    self._display, model_id, self._model_env_var, status, err_msg
                )
                if hint and has_next:
                    logger.info(
                        "%s model %s unavailable; retrying with fallback %s",
                        self._name,
                        model_id,
                        next_model,
                    )
                    continue
                return fail(
                    hint or f"{self._display} image generation failed ({status}): {err_msg}",
                    "model_access" if hint else "api_error",
                    model_id,
                )
            except requests.Timeout:
                # Must precede ConnectionError: ConnectTimeout derives from both.
                if has_next:
                    logger.info(
                        "%s model %s timed out; retrying with fallback %s",
                        self._name,
                        model_id,
                        next_model,
                    )
                    continue
                return fail(
                    f"{self._display} image generation timed out "
                    f"({int(_REQUEST_TIMEOUT)}s)",
                    "timeout",
                    model_id,
                )
            except requests.ConnectionError as exc:
                return fail(
                    f"{self._display} connection error: {exc}",
                    "connection_error",
                    model_id,
                )
            except requests.RequestException as exc:
                # Remaining transport failures (truncated body, redirect loop,
                # invalid URL, ...) are reported, not raised to the caller.
                return fail(
                    f"{self._display} request failed: {exc}",
                    "api_error",
                    model_id,
                )

            try:
                result = response.json()
            except Exception as exc:  # noqa: BLE001
                return fail(
                    f"{self._display} returned invalid JSON: {exc}",
                    "invalid_response",
                    model_id,
                )

            images = _extract_images(result)
            if not images:
                if has_next:
                    logger.info(
                        "%s model %s returned no image; retrying with fallback %s",
                        self._name,
                        model_id,
                        next_model,
                    )
                    continue
                # A response with text but no image usually means the model didn't
                # honor image output (wrong model or modalities); surface that.
                return fail(
                    f"{self._display} returned no image. Ensure the model "
                    f"'{model_id}' supports image output.",
                    "empty_response",
                    model_id,
                )

            first = images[0]
            try:
                if first.startswith("data:"):
                    # Keep only the payload after the "data:<mime>;base64," header.
                    b64 = first.split(",", 1)[1] if "," in first else ""
                    saved_path = save_b64_image(b64, prefix=f"{self._name}_gen")
                else:
                    saved_path = save_url_image(first, prefix=f"{self._name}_gen")
            except Exception as exc:  # noqa: BLE001
                return fail(f"Could not save generated image: {exc}", "io_error", model_id)

            return success_response(
                image=str(saved_path),
                model=model_id,
                prompt=prompt,
                aspect_ratio=aspect,
                provider=self._name,
            )

        # Every iteration above returns or moves on to a later model, so this is
        # reached only if the model chain was empty.
        return fail(
            f"{self._display} image generation failed after trying all candidate models.",
            "api_error",
            model_chain[-1] if model_chain else "",
        )
|
|
|
|
|
|
def _build_providers() -> List[OpenRouterCompatImageProvider]:
    """Create the two provider instances: OpenRouter first, then Nous Portal.

    Both use the same class; they differ in runtime name, config key,
    model-override env var and setup schema. The Nous entry declares no env
    vars and sets ``requires_nous_auth`` in its schema.
    """
    openrouter_schema: Dict[str, Any] = {
        "name": "OpenRouter (image)",
        "badge": "paid",
        "tag": "Gemini Flash Image & more via OpenRouter; uses OPENROUTER_API_KEY",
        "env_vars": [
            {
                "key": "OPENROUTER_API_KEY",
                "prompt": "OpenRouter API key",
                "url": "https://openrouter.ai/keys",
            }
        ],
    }
    nous_schema: Dict[str, Any] = {
        "name": "Nous Portal (image)",
        "badge": "subscription",
        "tag": "Reference-grounded image generation via Nous Portal (OpenRouter-backed)",
        "env_vars": [],
        "requires_nous_auth": True,
    }

    openrouter = OpenRouterCompatImageProvider(
        provider_name="openrouter",
        display_name="OpenRouter",
        runtime_name="openrouter",
        config_key="openrouter",
        model_env_var="OPENROUTER_IMAGE_MODEL",
        setup_schema=openrouter_schema,
    )
    nous = OpenRouterCompatImageProvider(
        provider_name="nous",
        display_name="Nous Portal",
        runtime_name="nous",
        config_key="nous",
        model_env_var="NOUS_IMAGE_MODEL",
        setup_schema=nous_schema,
    )
    return [openrouter, nous]
|
|
|
|
|
|
def register(ctx: Any) -> None:
    """Register the OpenRouter + Nous Portal image gen providers.

    Each provider returned by ``_build_providers`` is passed, in order, to
    ``ctx.register_image_gen_provider``.
    """
    providers = _build_providers()
    for image_provider in providers:
        ctx.register_image_gen_provider(image_provider)
|