mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(image-gen): add GPT Image 2 to FAL catalog (#13677)
Adds OpenAI's new GPT Image 2 model via FAL.ai, selectable through `hermes tools` → Image Generation. SOTA text rendering (including CJK) and world-aware photorealism. - FAL_MODELS entry with image_size_preset style - 4:3 presets on all aspect ratios — 16:9 (1024x576) falls below GPT-Image-2's 655,360 min-pixel floor and would be rejected - quality pinned to medium (same rule as gpt-image-1.5) for predictable Nous Portal billing - BYOK (openai_api_key) deliberately omitted from supports so all users stay on shared FAL billing - 6 new tests covering preset mapping, quality pinning, and supports-whitelist integrity - Docs table + aspect-ratio map updated Live-tested end-to-end: 39.9s cold request, clean 1024x768 PNG
This commit is contained in:
parent
e889332c99
commit
5ffae9228b
3 changed files with 90 additions and 11 deletions
|
|
@ -136,6 +136,49 @@ class TestGptLiteralFamily:
|
||||||
assert p["image_size"] == "1024x1536"
|
assert p["image_size"] == "1024x1536"
|
||||||
|
|
||||||
|
|
||||||
|
class TestGptImage2Presets:
    """GPT Image 2 uses preset enum sizes (not literal strings like 1.5).

    Mapped to 4:3 variants so we stay above the 655,360 min-pixel floor
    (16:9 presets at 1024x576 = 589,824 would be rejected)."""

    def test_gpt2_landscape_uses_4_3_preset(self, image_tool):
        # landscape must map to the 4:3 preset, not 16:9 (too few pixels).
        p = image_tool._build_fal_payload("fal-ai/gpt-image-2", "hello", "landscape")
        assert p["image_size"] == "landscape_4_3"

    def test_gpt2_square_uses_square_hd(self, image_tool):
        p = image_tool._build_fal_payload("fal-ai/gpt-image-2", "hello", "square")
        assert p["image_size"] == "square_hd"

    def test_gpt2_portrait_uses_4_3_preset(self, image_tool):
        p = image_tool._build_fal_payload("fal-ai/gpt-image-2", "hello", "portrait")
        assert p["image_size"] == "portrait_4_3"

    def test_gpt2_quality_pinned_to_medium(self, image_tool):
        # Quality is pinned (not user-overridable) for predictable billing.
        p = image_tool._build_fal_payload("fal-ai/gpt-image-2", "hi", "square")
        assert p["quality"] == "medium"

    def test_gpt2_strips_byok_and_unsupported_overrides(self, image_tool):
        """openai_api_key (BYOK) is deliberately not in supports — all users
        route through shared FAL billing. guidance_scale/num_inference_steps
        aren't in the model's API surface either."""
        p = image_tool._build_fal_payload(
            "fal-ai/gpt-image-2", "hi", "square",
            overrides={
                "openai_api_key": "sk-...",
                "guidance_scale": 7.5,
                "num_inference_steps": 50,
            },
        )
        assert "openai_api_key" not in p
        assert "guidance_scale" not in p
        assert "num_inference_steps" not in p

    def test_gpt2_strips_seed_even_if_passed(self, image_tool):
        # seed isn't in the GPT Image 2 API surface either.
        p = image_tool._build_fal_payload("fal-ai/gpt-image-2", "hi", "square", seed=42)
        assert "seed" not in p
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Supports whitelist — the main safety property
|
# Supports whitelist — the main safety property
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -231,10 +274,11 @@ class TestGptQualityPinnedToMedium:
|
||||||
assert p["quality"] == "medium"
|
assert p["quality"] == "medium"
|
||||||
|
|
||||||
def test_non_gpt_model_never_gets_quality(self, image_tool):
    """quality is only meaningful for GPT-Image models (1.5, 2) — other
    models should never have it in their payload."""
    # Whitelist the two GPT-Image entries; everything else must omit 'quality'.
    gpt_models = {"fal-ai/gpt-image-1.5", "fal-ai/gpt-image-2"}
    for mid in image_tool.FAL_MODELS:
        if mid in gpt_models:
            continue
        p = image_tool._build_fal_payload(mid, "hi", "square")
        assert "quality" not in p, f"{mid} unexpectedly has 'quality' in payload"
|
||||||
|
|
|
||||||
|
|
@ -188,6 +188,38 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
|
||||||
},
|
},
|
||||||
"upscale": False,
|
"upscale": False,
|
||||||
},
|
},
|
||||||
|
"fal-ai/gpt-image-2": {
|
||||||
|
"display": "GPT Image 2",
|
||||||
|
"speed": "~20s",
|
||||||
|
"strengths": "SOTA text rendering + CJK, world-aware photorealism",
|
||||||
|
"price": "$0.04–0.06/image",
|
||||||
|
# GPT Image 2 uses FAL's standard preset enum (unlike 1.5's literal
|
||||||
|
# dimensions). We map to the 4:3 variants — the 16:9 presets
|
||||||
|
# (1024x576) fall below GPT-Image-2's 655,360 min-pixel requirement
|
||||||
|
# and would be rejected. 4:3 keeps us above the minimum on all
|
||||||
|
# three aspect ratios.
|
||||||
|
"size_style": "image_size_preset",
|
||||||
|
"sizes": {
|
||||||
|
"landscape": "landscape_4_3", # 1024x768
|
||||||
|
"square": "square_hd", # 1024x1024
|
||||||
|
"portrait": "portrait_4_3", # 768x1024
|
||||||
|
},
|
||||||
|
"defaults": {
|
||||||
|
# Same quality pinning as gpt-image-1.5: medium keeps Nous
|
||||||
|
# Portal billing predictable. "high" is 3-4x the per-image
|
||||||
|
# cost at the same size; "low" is too rough for production use.
|
||||||
|
"quality": "medium",
|
||||||
|
"num_images": 1,
|
||||||
|
"output_format": "png",
|
||||||
|
},
|
||||||
|
"supports": {
|
||||||
|
"prompt", "image_size", "quality", "num_images", "output_format",
|
||||||
|
"sync_mode",
|
||||||
|
# openai_api_key (BYOK) intentionally omitted — all users go
|
||||||
|
# through the shared FAL billing path.
|
||||||
|
},
|
||||||
|
"upscale": False,
|
||||||
|
},
|
||||||
"fal-ai/ideogram/v3": {
|
"fal-ai/ideogram/v3": {
|
||||||
"display": "Ideogram V3",
|
"display": "Ideogram V3",
|
||||||
"speed": "~5s",
|
"speed": "~5s",
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,13 @@
|
||||||
---
|
---
|
||||||
title: Image Generation
|
title: Image Generation
|
||||||
description: Generate images via FAL.ai — 8 models including FLUX 2, GPT-Image, Nano Banana Pro, Ideogram, Recraft V4 Pro, and more, selectable via `hermes tools`.
|
description: Generate images via FAL.ai — 9 models including FLUX 2, GPT Image (1.5 & 2), Nano Banana Pro, Ideogram, Recraft V4 Pro, and more, selectable via `hermes tools`.
|
||||||
sidebar_label: Image Generation
|
sidebar_label: Image Generation
|
||||||
sidebar_position: 6
|
sidebar_position: 6
|
||||||
---
|
---
|
||||||
|
|
||||||
# Image Generation
|
# Image Generation
|
||||||
|
|
||||||
Hermes Agent generates images from text prompts via FAL.ai. Eight models are supported out of the box, each with different speed, quality, and cost tradeoffs. The active model is user-configurable via `hermes tools` and persists in `config.yaml`.
|
Hermes Agent generates images from text prompts via FAL.ai. Nine models are supported out of the box, each with different speed, quality, and cost tradeoffs. The active model is user-configurable via `hermes tools` and persists in `config.yaml`.
|
||||||
|
|
||||||
## Supported Models
|
## Supported Models
|
||||||
|
|
||||||
|
|
@ -18,6 +18,7 @@ Hermes Agent generates images from text prompts via FAL.ai. Eight models are sup
|
||||||
| `fal-ai/z-image/turbo` | ~2s | Bilingual EN/CN, 6B params | $0.005/MP |
|
| `fal-ai/z-image/turbo` | ~2s | Bilingual EN/CN, 6B params | $0.005/MP |
|
||||||
| `fal-ai/nano-banana-pro` | ~8s | Gemini 3 Pro, reasoning depth, text rendering | $0.15/image (1K) |
|
| `fal-ai/nano-banana-pro` | ~8s | Gemini 3 Pro, reasoning depth, text rendering | $0.15/image (1K) |
|
||||||
| `fal-ai/gpt-image-1.5` | ~15s | Prompt adherence | $0.034/image |
|
| `fal-ai/gpt-image-1.5` | ~15s | Prompt adherence | $0.034/image |
|
||||||
|
| `fal-ai/gpt-image-2` | ~20s | SOTA text rendering + CJK, world-aware photorealism | $0.04–0.06/image |
|
||||||
| `fal-ai/ideogram/v3` | ~5s | Best typography | $0.03–0.09/image |
|
| `fal-ai/ideogram/v3` | ~5s | Best typography | $0.03–0.09/image |
|
||||||
| `fal-ai/recraft/v4/pro/text-to-image` | ~8s | Design, brand systems, production-ready | $0.25/image |
|
| `fal-ai/recraft/v4/pro/text-to-image` | ~8s | Design, brand systems, production-ready | $0.25/image |
|
||||||
| `fal-ai/qwen-image` | ~12s | LLM-based, complex text | $0.02/MP |
|
| `fal-ai/qwen-image` | ~12s | LLM-based, complex text | $0.02/MP |
|
||||||
|
|
@ -65,7 +66,7 @@ image_gen:
|
||||||
|
|
||||||
### GPT-Image Quality
|
### GPT-Image Quality
|
||||||
|
|
||||||
The `fal-ai/gpt-image-1.5` request quality is pinned to `medium` (~$0.034/image at 1024×1024). We don't expose the `low` / `high` tiers as a user-facing option so that Nous Portal billing stays predictable across all users — the cost spread between tiers is ~22×. If you want a cheaper GPT-Image option, pick a different model; if you want higher quality, use Klein 9B or Imagen-class models.
|
The `fal-ai/gpt-image-1.5` and `fal-ai/gpt-image-2` request quality is pinned to `medium` (~$0.034–$0.06/image at 1024×1024). We don't expose the `low` / `high` tiers as a user-facing option so that Nous Portal billing stays predictable across all users — the cost spread between tiers is 3–22×. If you want a cheaper option, pick Klein 9B or Z-Image Turbo; if you want higher quality, use Nano Banana Pro or Recraft V4 Pro.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
@ -87,11 +88,13 @@ Make me a futuristic cityscape, landscape orientation
|
||||||
|
|
||||||
Every model accepts the same three aspect ratios from the agent's perspective. Internally, each model's native size spec is filled in automatically:
|
Every model accepts the same three aspect ratios from the agent's perspective. Internally, each model's native size spec is filled in automatically:
|
||||||
|
|
||||||
| Agent input | image_size (flux/z-image/qwen/recraft/ideogram) | aspect_ratio (nano-banana-pro) | image_size (gpt-image) |
|
| Agent input | image_size (flux/z-image/qwen/recraft/ideogram) | aspect_ratio (nano-banana-pro) | image_size (gpt-image-1.5) | image_size (gpt-image-2) |
|
||||||
|---|---|---|---|
|
|---|---|---|---|---|
|
||||||
| `landscape` | `landscape_16_9` | `16:9` | `1536x1024` |
|
| `landscape` | `landscape_16_9` | `16:9` | `1536x1024` | `landscape_4_3` (1024×768) |
|
||||||
| `square` | `square_hd` | `1:1` | `1024x1024` |
|
| `square` | `square_hd` | `1:1` | `1024x1024` | `square_hd` (1024×1024) |
|
||||||
| `portrait` | `portrait_16_9` | `9:16` | `1024x1536` |
|
| `portrait` | `portrait_16_9` | `9:16` | `1024x1536` | `portrait_4_3` (768×1024) |
|
||||||
|
|
||||||
|
GPT Image 2 maps to 4:3 presets rather than 16:9 because its minimum pixel count is 655,360 — the `landscape_16_9` preset (1024×576 = 589,824) would be rejected.
|
||||||
|
|
||||||
This translation happens in `_build_fal_payload()` — agent code never has to know about per-model schema differences.
|
This translation happens in `_build_fal_payload()` — agent code never has to know about per-model schema differences.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue