diff --git a/tests/tools/test_image_generation.py b/tests/tools/test_image_generation.py index cf4e087068..4cde05fb4e 100644 --- a/tests/tools/test_image_generation.py +++ b/tests/tools/test_image_generation.py @@ -107,16 +107,16 @@ class TestAspectRatioFamily: """Nano-banana uses aspect_ratio enum, NOT image_size.""" def test_nano_banana_landscape_uses_aspect_ratio(self, image_tool): - p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "landscape") + p = image_tool._build_fal_payload("fal-ai/nano-banana-pro", "hello", "landscape") assert p["aspect_ratio"] == "16:9" assert "image_size" not in p def test_nano_banana_square_uses_aspect_ratio(self, image_tool): - p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "square") + p = image_tool._build_fal_payload("fal-ai/nano-banana-pro", "hello", "square") assert p["aspect_ratio"] == "1:1" def test_nano_banana_portrait_uses_aspect_ratio(self, image_tool): - p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "portrait") + p = image_tool._build_fal_payload("fal-ai/nano-banana-pro", "hello", "portrait") assert p["aspect_ratio"] == "9:16" @@ -164,13 +164,17 @@ class TestSupportsFilter: assert "num_inference_steps" not in p def test_recraft_has_minimal_payload(self, image_tool): - # Recraft supports prompt, image_size, style only. - p = image_tool._build_fal_payload("fal-ai/recraft-v3", "hi", "landscape") - assert set(p.keys()) <= {"prompt", "image_size", "style"} + # Recraft V4 Pro supports prompt, image_size, enable_safety_checker, + # colors, background_color (no seed, no style — V4 dropped V3's style enum). + p = image_tool._build_fal_payload("fal-ai/recraft/v4/pro/text-to-image", "hi", "landscape") + assert set(p.keys()) <= { + "prompt", "image_size", "enable_safety_checker", + "colors", "background_color", + } def test_nano_banana_never_gets_image_size(self, image_tool): # Common bug: translator accidentally setting both image_size and aspect_ratio. - p = image_tool._build_fal_payload("fal-ai/nano-banana", "hi", "landscape", seed=1) + p = image_tool._build_fal_payload("fal-ai/nano-banana-pro", "hi", "landscape", seed=1) assert "image_size" not in p assert p["aspect_ratio"] == "16:9" @@ -285,9 +289,9 @@ class TestModelResolution: def test_config_wins_over_env_var(self, image_tool, monkeypatch): monkeypatch.setenv("FAL_IMAGE_MODEL", "fal-ai/z-image/turbo") with patch("hermes_cli.config.load_config", - return_value={"image_gen": {"model": "fal-ai/nano-banana"}}): + return_value={"image_gen": {"model": "fal-ai/nano-banana-pro"}}): mid, _ = image_tool._resolve_fal_model() - assert mid == "fal-ai/nano-banana" + assert mid == "fal-ai/nano-banana-pro" # --------------------------------------------------------------------------- @@ -387,10 +391,10 @@ class TestManagedGatewayErrorTranslation: lambda gw: mock_managed_client) with pytest.raises(ValueError) as exc_info: - image_tool._submit_fal_request("fal-ai/nano-banana", {"prompt": "x"}) + image_tool._submit_fal_request("fal-ai/nano-banana-pro", {"prompt": "x"}) msg = str(exc_info.value) - assert "fal-ai/nano-banana" in msg + assert "fal-ai/nano-banana-pro" in msg assert "403" in msg assert "FAL_KEY" in msg assert "hermes tools" in msg diff --git a/tools/image_generation_tool.py b/tools/image_generation_tool.py index 8871b8df5d..cf1003d12b 100644 --- a/tools/image_generation_tool.py +++ b/tools/image_generation_tool.py @@ -134,11 +134,11 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { }, "upscale": False, }, - "fal-ai/nano-banana": { - "display": "Nano Banana (Gemini 2.5 Flash Image)", - "speed": "~6s", - "strengths": "Gemini 2.5, consistency", - "price": "$0.08/image", + "fal-ai/nano-banana-pro": { + "display": "Nano Banana Pro (Gemini 3 Pro Image)", + "speed": "~8s", + "strengths": "Gemini 3 Pro, reasoning depth, text rendering", + "price": "$0.15/image (1K)", "size_style": "aspect_ratio", "sizes": { "landscape": "16:9", @@ -149,10 +149,14 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "num_images": 1, "output_format": "png", "safety_tolerance": "5", + # "1K" is the cheapest tier; 4K doubles the per-image cost. + # Users on Nous Subscription should stay at 1K for predictable billing. + "resolution": "1K", }, "supports": { "prompt", "aspect_ratio", "num_images", "output_format", - "safety_tolerance", "seed", "sync_mode", + "safety_tolerance", "seed", "sync_mode", "resolution", + "enable_web_search", "limit_generations", }, "upscale": False, }, @@ -202,11 +206,11 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { }, "upscale": False, }, - "fal-ai/recraft-v3": { - "display": "Recraft V3", + "fal-ai/recraft/v4/pro/text-to-image": { + "display": "Recraft V4 Pro", "speed": "~8s", - "strengths": "Vector, brand styles", - "price": "$0.04/image", + "strengths": "Design, brand systems, production-ready", + "price": "$0.25/image", "size_style": "image_size_preset", "sizes": { "landscape": "landscape_16_9", @@ -214,10 +218,12 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = { "portrait": "portrait_16_9", }, "defaults": { - "style": "realistic_image", + # V4 Pro dropped V3's required `style` enum — defaults handle taste now. + "enable_safety_checker": False, }, "supports": { - "prompt", "image_size", "style", + "prompt", "image_size", "enable_safety_checker", + "colors", "background_color", }, "upscale": False, }, diff --git a/website/docs/user-guide/features/image-generation.md b/website/docs/user-guide/features/image-generation.md index 701d4a4fa2..43abc6c201 100644 --- a/website/docs/user-guide/features/image-generation.md +++ b/website/docs/user-guide/features/image-generation.md @@ -1,6 +1,6 @@ --- title: Image Generation -description: Generate images via FAL.ai — 8 models including FLUX 2, GPT-Image, Nano Banana, Ideogram, and more, selectable via `hermes tools`. +description: Generate images via FAL.ai — 8 models including FLUX 2, GPT-Image, Nano Banana Pro, Ideogram, Recraft V4 Pro, and more, selectable via `hermes tools`. sidebar_label: Image Generation sidebar_position: 6 --- @@ -13,13 +13,13 @@ Hermes Agent generates images from text prompts via FAL.ai. Eight models are sup | Model | Speed | Strengths | Price | |---|---|---|---| -| `fal-ai/flux-2/klein/9b` *(default)* | <1s | Fast, crisp text | $0.006/MP | +| `fal-ai/flux-2/klein/9b` *(default)* | `<1s` | Fast, crisp text | $0.006/MP | | `fal-ai/flux-2-pro` | ~6s | Studio photorealism | $0.03/MP | | `fal-ai/z-image/turbo` | ~2s | Bilingual EN/CN, 6B params | $0.005/MP | -| `fal-ai/nano-banana` | ~6s | Gemini 2.5, character consistency | $0.08/image | +| `fal-ai/nano-banana-pro` | ~8s | Gemini 3 Pro, reasoning depth, text rendering | $0.15/image (1K) | | `fal-ai/gpt-image-1.5` | ~15s | Prompt adherence | $0.034/image | | `fal-ai/ideogram/v3` | ~5s | Best typography | $0.03–0.09/image | -| `fal-ai/recraft-v3` | ~8s | Vector art, brand styles | $0.04/image | +| `fal-ai/recraft/v4/pro/text-to-image` | ~8s | Design, brand systems, production-ready | $0.25/image | | `fal-ai/qwen-image` | ~12s | LLM-based, complex text | $0.02/MP | Prices are FAL's pricing at time of writing; check [fal.ai](https://fal.ai/) for current numbers. @@ -87,7 +87,7 @@ Make me a futuristic cityscape, landscape orientation Every model accepts the same three aspect ratios from the agent's perspective. Internally, each model's native size spec is filled in automatically: -| Agent input | image_size (flux/z-image/qwen/recraft/ideogram) | aspect_ratio (nano-banana) | image_size (gpt-image) | +| Agent input | image_size (flux/z-image/qwen/recraft/ideogram) | aspect_ratio (nano-banana-pro) | image_size (gpt-image) | |---|---|---|---| | `landscape` | `landscape_16_9` | `16:9` | `1536x1024` | | `square` | `square_hd` | `1:1` | `1024x1024` | diff --git a/website/docs/user-guide/features/overview.md b/website/docs/user-guide/features/overview.md index 10ecb90bae..df3c26becf 100644 --- a/website/docs/user-guide/features/overview.md +++ b/website/docs/user-guide/features/overview.md @@ -30,7 +30,7 @@ Hermes Agent includes a rich set of capabilities that extend far beyond basic ch - **[Voice Mode](voice-mode.md)** — Full voice interaction across CLI and messaging platforms. Talk to the agent using your microphone, hear spoken replies, and have live voice conversations in Discord voice channels. - **[Browser Automation](browser.md)** — Full browser automation with multiple backends: Browserbase cloud, Browser Use cloud, local Chrome via CDP, or local Chromium. Navigate websites, fill forms, and extract information. - **[Vision & Image Paste](vision.md)** — Multimodal vision support. Paste images from your clipboard into the CLI and ask the agent to analyze, describe, or work with them using any vision-capable model. -- **[Image Generation](image-generation.md)** — Generate images from text prompts using FAL.ai. Eight models supported (FLUX 2 Klein/Pro, GPT-Image 1.5, Nano Banana, Ideogram V3, Recraft V3, Qwen, Z-Image Turbo); pick one via `hermes tools`. +- **[Image Generation](image-generation.md)** — Generate images from text prompts using FAL.ai. Eight models supported (FLUX 2 Klein/Pro, GPT-Image 1.5, Nano Banana Pro, Ideogram V3, Recraft V4 Pro, Qwen, Z-Image Turbo); pick one via `hermes tools`. - **[Voice & TTS](tts.md)** — Text-to-speech output and voice message transcription across all messaging platforms, with five provider options: Edge TTS (free), ElevenLabs, OpenAI TTS, MiniMax, and NeuTTS. ## Integrations diff --git a/website/docs/user-guide/features/tool-gateway.md b/website/docs/user-guide/features/tool-gateway.md index b33f8e09d0..9b1b4f4f3a 100644 --- a/website/docs/user-guide/features/tool-gateway.md +++ b/website/docs/user-guide/features/tool-gateway.md @@ -18,7 +18,7 @@ The **Tool Gateway** lets paid [Nous Portal](https://portal.nousresearch.com) su | Tool | What It Does | Direct Alternative | |------|--------------|--------------------| | **Web search & extract** | Search the web and extract page content via Firecrawl | `FIRECRAWL_API_KEY`, `EXA_API_KEY`, `PARALLEL_API_KEY`, `TAVILY_API_KEY` | -| **Image generation** | Generate images via FAL (8 models: FLUX 2 Klein/Pro, GPT-Image, Nano Banana, Ideogram, Recraft, Qwen, Z-Image) | `FAL_KEY` | +| **Image generation** | Generate images via FAL (8 models: FLUX 2 Klein/Pro, GPT-Image, Nano Banana Pro, Ideogram, Recraft V4 Pro, Qwen, Z-Image) | `FAL_KEY` | | **Text-to-speech** | Convert text to speech via OpenAI TTS | `VOICE_TOOLS_OPENAI_KEY`, `ELEVENLABS_API_KEY` | | **Browser automation** | Control cloud browsers via Browser Use | `BROWSER_USE_API_KEY`, `BROWSERBASE_API_KEY` |