Merge pull request #52303 from NousResearch/bb/pets-gen-qa

feat(pets): quality-first OpenRouter chain, stronger atlas gates, global pet-gen notifications
This commit is contained in:
brooklyn! 2026-06-24 23:16:40 -05:00 committed by GitHub
commit 0c442fa1d3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 918 additions and 194 deletions

View file

@ -127,6 +127,22 @@ def _near_key_mask(image, key: tuple[int, int, int], tol: int = 48):
)
def _defringe(rgba):
    """Shave the 1px antialiased edge ring left after keying.

    Chroma keying can't catch the antialiased band where the sprite meets the
    backdrop: those pixels are a key/sprite blend, too far from the key to be
    removed, so they ring the cutout in magenta/green. Eroding the alpha by one
    pixel (a 3x3 min filter) drops that contaminated ring; the sprite's own
    thick dark outline keeps the silhouette intact.

    The alpha channel of *rgba* is replaced in place and the same image object
    is returned. The erosion is a single C-level filter, no per-pixel Python.
    """
    from PIL import ImageFilter

    eroded_alpha = rgba.getchannel("A").filter(ImageFilter.MinFilter(3))
    rgba.putalpha(eroded_alpha)
    return rgba
def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None, threshold: float = 90.0):
"""Return *image* (RGBA) with its flat background keyed out to transparent.
@ -163,7 +179,8 @@ def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None,
near = _near_key_mask(rgba, key) # L mask, 255 where near key
opaque = rgba.getchannel("A").point(lambda a: 255 if a > _ALPHA_FLOOR else 0)
remove_mask = ImageChops.darker(near, opaque)
return Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, remove_mask)
keyed = Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, remove_mask)
return _defringe(keyed)
visited = bytearray(w * h)
# Mark removals in a flat mask and apply them in one C composite at the end —
@ -209,7 +226,7 @@ def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None,
# One C-level composite instead of millions of per-pixel writes: paint the
# flooded pixels to (0,0,0,0) wherever the mask is set.
mask = Image.frombytes("L", (w, h), bytes(remove)).point(lambda v: 255 if v else 0)
return Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, mask)
return _defringe(Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, mask))
def _repair_internal_alpha_holes(image):
@ -369,6 +386,279 @@ def _drop_side_bleed(image):
return rgba
def _erase_long_axis_lines(image):
    """Remove thin slot-spanning guide/floor/divider lines.

    Gemini will sometimes satisfy "baseline" / "cell" language by drawing
    literal horizontal floors or vertical panel dividers. They survive chroma
    keying and connect otherwise clean poses. Drop only *thin* rows/columns that
    span nearly the whole slot; thick sprite body rows are left alone.

    A row (column) "spans" the slot when at least 85% of its pixels have alpha
    above ``_ALPHA_FLOOR``. A run of consecutive spanning rows (columns) is
    cleared to fully transparent only when the run is at most 4px thick.

    Returns a new RGBA image; *image* itself is not modified.
    """
    rgba = image.convert("RGBA").copy()
    w, h = rgba.size
    if w == 0 or h == 0:
        # Degenerate image: nothing to scan and nothing to erase.
        return rgba

    # Binarise the alpha channel once in C and count opaque pixels per
    # row/column with bytes operations, instead of w*h Python-level
    # getpixel() calls for the rows and another w*h for the columns.
    opaque = rgba.getchannel("A").point(lambda a: 255 if a > _ALPHA_FLOOR else 0).tobytes()

    def _thin_groups(indices: list[int]) -> list[tuple[int, int]]:
        """Half-open ``(start, stop)`` spans of consecutive indices at most 4 long."""
        groups: list[tuple[int, int]] = []
        run_start: int | None = None
        run_end: int | None = None
        for idx in indices:
            if run_start is not None and run_end is not None:
                if idx == run_end + 1:
                    run_end = idx
                    continue
                if run_end - run_start + 1 <= 4:
                    groups.append((run_start, run_end + 1))
            run_start = run_end = idx
        if run_start is not None and run_end is not None and run_end - run_start + 1 <= 4:
            groups.append((run_start, run_end + 1))
        return groups

    # Both profiles are measured before anything is erased, so clearing rows
    # cannot change which columns qualify.
    wide_rows = [y for y in range(h) if opaque.count(255, y * w, (y + 1) * w) >= w * 0.85]
    tall_cols = [x for x in range(w) if opaque[x::w].count(255) >= h * 0.85]

    transparent = (0, 0, 0, 0)
    for top, bottom in _thin_groups(wide_rows):
        rgba.paste(transparent, (0, top, w, bottom))
    for left, right in _thin_groups(tall_cols):
        rgba.paste(transparent, (left, 0, right, h))
    return rgba
def _component_boxes(image) -> list[tuple[tuple[int, int, int, int], int]]:
    """Connected opaque components as ``[(bbox, mass)]``.

    A full ML segmenter would be overkill here: after chroma keying, "the pet" is
    the dominant connected alpha component inside each known slot. Tiny detached
    sparkles, tears, UI dots, and neighbour slivers are separate components.

    Pixels whose alpha exceeds ``_ALPHA_FLOOR`` are opaque; components are
    4-connected. Each ``bbox`` is ``(left, top, right, bottom)`` in full-image
    coordinates with exclusive right/bottom, and ``mass`` is the component's
    pixel count. Components are listed in row-major order of their first pixel.
    """
    from collections import deque

    rgba = image.convert("RGBA")
    content = rgba.getbbox()
    if content is None:
        return []

    # Only scan inside the content bbox; local coordinates are offset by it.
    off_x, off_y, content_right, content_bottom = content
    w = content_right - off_x
    h = content_bottom - off_y
    alpha = rgba.getchannel("A").load()
    seen = bytearray(w * h)
    components: list[tuple[tuple[int, int, int, int], int]] = []

    for sy in range(h):
        for sx in range(w):
            seed = sy * w + sx
            if seen[seed]:
                continue
            seen[seed] = 1
            if alpha[off_x + sx, off_y + sy] <= _ALPHA_FLOOR:
                continue

            # Flood-fill outward from this seed, tracking extent and size.
            pending: deque[tuple[int, int]] = deque([(sx, sy)])
            min_x = max_x = sx
            min_y = max_y = sy
            mass = 0
            while pending:
                x, y = pending.popleft()
                mass += 1
                min_x, max_x = min(min_x, x), max(max_x, x)
                min_y, max_y = min(min_y, y), max(max_y, y)
                for nx, ny in ((x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)):
                    if not (0 <= nx < w and 0 <= ny < h):
                        continue
                    idx = ny * w + nx
                    if seen[idx]:
                        continue
                    # Mark transparent neighbours too so they are never re-tested.
                    seen[idx] = 1
                    if alpha[off_x + nx, off_y + ny] > _ALPHA_FLOOR:
                        pending.append((nx, ny))

            box = (off_x + min_x, off_y + min_y, off_x + max_x + 1, off_y + max_y + 1)
            components.append((box, mass))
    return components
def _isolate_slot_subject(image):
    """Keep the slot's real subject; drop detached effects/noise.

    Thin slot-spanning lines are erased first. The heaviest connected component
    is the subject. Another component survives only when its mass is at least
    ``max(24, 3.5% of the subject's mass)`` and it either overlaps the subject's
    horizontal span by 30% of the subject's width or has its horizontal centre
    within the subject's span widened by 25% of that width on each side.

    Survivors are copied by bounding box onto a fresh transparent canvas of the
    same size, which is returned. With no components at all, the line-erased
    image is returned as-is.
    """
    from PIL import Image

    cleaned = _erase_long_axis_lines(image)
    components = _component_boxes(cleaned)
    if not components:
        return cleaned

    main_box, main_mass = max(components, key=lambda item: item[1])
    main_left, _main_top, main_right, _main_bottom = main_box
    main_width = max(1, main_right - main_left)
    mass_floor = max(24, main_mass * 0.035)
    reach = main_width * 0.25

    def _belongs(box, mass) -> bool:
        if box == main_box:
            return True
        if mass < mass_floor:
            return False
        left, _top, right, _bottom = box
        shared = max(0, min(right, main_right) - max(left, main_left))
        if shared >= main_width * 0.3:
            return True
        # Keep meaningful attached-looking accessories such as halos; drop
        # sparkles/tears/noise that sit outside the body column.
        center_x = (left + right) / 2
        return (main_left - reach) <= center_x <= (main_right + reach)

    result = Image.new("RGBA", cleaned.size, (0, 0, 0, 0))
    for box, mass in components:
        if _belongs(box, mass):
            result.alpha_composite(cleaned.crop(box), (box[0], box[1]))
    return result
def _has_slot_padding(image) -> bool:
"""True when content has empty room on all four slot edges."""
bbox = image.getbbox()
if bbox is None:
return False
w, h = image.size
left, top, right, bottom = bbox
min_x = max(4, min(12, round(w * 0.025)))
min_y = max(4, min(16, round(h * 0.02)))
return left >= min_x and top >= min_y and w - right >= min_x and h - bottom >= min_y
def _slot_bounds(width: int, frame_count: int) -> list[tuple[int, int]]:
return [
(round(i * width / frame_count), round((i + 1) * width / frame_count))
for i in range(frame_count)
]
def _group_component_rows(boxes: list[tuple[int, int, int, int]]) -> list[list[tuple[int, int, int, int]]]:
"""Group component boxes into visual rows, then sort left→right."""
if not boxes:
return []
heights = sorted(max(1, b[3] - b[1]) for b in boxes)
row_tol = max(12, heights[len(heights) // 2] * 0.55)
rows: list[list[tuple[int, int, int, int]]] = []
centers: list[float] = []
for box in sorted(boxes, key=lambda b: (b[1] + b[3]) / 2):
cy = (box[1] + box[3]) / 2
for i, center in enumerate(centers):
if abs(cy - center) <= row_tol:
rows[i].append(box)
centers[i] = sum((b[1] + b[3]) / 2 for b in rows[i]) / len(rows[i])
break
else:
rows.append([box])
centers.append(cy)
ordered = [row for _center, row in sorted(zip(centers, rows, strict=False), key=lambda item: item[0])]
for row in ordered:
row.sort(key=lambda b: (b[0] + b[2]) / 2)
return ordered
def _merge_related_boxes(boxes: list[tuple[int, int, int, int]]) -> list[tuple[int, int, int, int]]:
"""Merge disconnected parts that clearly belong to one subject.
Capes, tails, horns, and held props sometimes key as separate components.
Merge components on the same visual row when their vertical spans overlap and
the horizontal gap is tiny compared with the component size. Do not bridge the
much larger gaps between separate poses.
"""
boxes = list(boxes)
changed = True
while changed:
changed = False
merged: list[tuple[int, int, int, int]] = []
used = [False] * len(boxes)
for i, a in enumerate(boxes):
if used[i]:
continue
al, at, ar, ab = a
used[i] = True
for j in range(i + 1, len(boxes)):
if used[j]:
continue
bl, bt, br, bb = boxes[j]
v_overlap = max(0, min(ab, bb) - max(at, bt))
min_h = max(1, min(ab - at, bb - bt))
gap = max(0, max(al, bl) - min(ar, br))
min_w = max(1, min(ar - al, br - bl))
if v_overlap >= min_h * 0.45 and gap <= max(14, min_w * 0.22):
al, at, ar, ab = min(al, bl), min(at, bt), max(ar, br), max(ab, bb)
used[j] = True
changed = True
merged.append((al, at, ar, ab))
boxes = merged
return boxes
def _component_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None:
    """Extract frame subjects as connected non-background objects.

    This is the robust path for models that ignore "one horizontal row" and emit a
    2D sprite grid. We count real opaque subject components, discard tiny
    detached effects, sort in reading order, and return exactly *frame_count*
    frames. Slot slicing is only a fallback when object detection can't satisfy
    the contract.

    Components lighter than ``max(64, 12% of the heaviest)`` are ignored and
    related boxes are merged before counting. With *require_padding*, every
    chosen box must sit at least ``max(4, min(12, 1% of width))`` px from the
    left/right edges and ``max(4, min(16, 1.5% of height))`` px from the
    top/bottom edges. The strip is tried as-is, then once more with thin
    spanning lines erased. Returns ``None`` when the contract can't be met.
    """
    from PIL import Image

    def _extract(source) -> list | None:
        components = _component_boxes(source)
        if not components:
            return None

        heaviest = max(mass for _box, mass in components)
        mass_floor = max(64, heaviest * 0.12)
        subjects = _merge_related_boxes([box for box, mass in components if mass >= mass_floor])
        if len(subjects) < frame_count:
            return None

        rows = _group_component_rows(subjects)
        chosen = [box for row in rows for box in row][:frame_count]
        if len(chosen) < frame_count:
            return None

        src_w, src_h = source.width, source.height
        if require_padding:
            edge_x = max(4, min(12, round(src_w * 0.01)))
            edge_y = max(4, min(16, round(src_h * 0.015)))
            for left, top, right, bottom in chosen:
                if left < edge_x or top < edge_y or src_w - right < edge_x or src_h - bottom < edge_y:
                    return None

        is_grid = len(rows) > 1
        frames = []
        for left, top, right, bottom in chosen:
            pad_x = max(8, round((right - left) * 0.08))
            pad_y = max(8, round((bottom - top) * 0.08))
            if is_grid:
                crop_left, crop_top = max(0, left - pad_x), max(0, top - pad_y)
                crop_right, crop_bottom = min(src_w, right + pad_x), min(src_h, bottom + pad_y)
            elif frame_count == 1:
                crop_left, crop_top, crop_right, crop_bottom = 0, 0, src_w, src_h
            else:
                # Preserve vertical motion for true one-row strips (jumping,
                # bobbing) while still narrowing X around the object.
                crop_left, crop_top = max(0, left - pad_x), 0
                crop_right, crop_bottom = min(src_w, right + pad_x), src_h

            # Only the subject box's pixels are copied; the rest of the frame
            # canvas stays transparent.
            frame = Image.new("RGBA", (crop_right - crop_left, crop_bottom - crop_top), (0, 0, 0, 0))
            frame.alpha_composite(source.crop((left, top, right, bottom)), (left - crop_left, top - crop_top))
            # The global component pass already chose the subject box. Do not run
            # another component filter here: capes/tails can be legitimate
            # disconnected lobes inside the chosen subject box.
            frames.append(frame)
        return frames

    return _extract(strip) or _extract(_erase_long_axis_lines(strip))
def _sever_expected_gutters(strip, frame_count: int):
"""Cut thin vertical gutters at expected frame boundaries before labeling.
@ -397,17 +687,23 @@ def _sever_expected_gutters(strip, frame_count: int):
return out
def _slot_crops(strip, frame_count: int) -> list:
def _slot_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None:
"""Slice *strip* into *frame_count* uniform columns (one coordinate space).
Equal-width columns keep every frame in a single shared coordinate frame, so
a later union-crop + shared placement (:func:`normalize_cells`) preserves the
row's real motion without the per-frame re-centering that makes a pet visibly
slide. Neighbour side-bleed is trimmed per column.
slide. Each slot is cleaned independently so detached effects, floors,
dividers, and neighbour slivers do not become "frames".
"""
w0 = max(1, strip.width // frame_count)
h = strip.height
return [_drop_side_bleed(strip.crop((i * w0, 0, i * w0 + w0, h))) for i in range(frame_count)]
frames = []
for left, right in _slot_bounds(strip.width, frame_count):
slot = _drop_side_bleed(_isolate_slot_subject(strip.crop((left, 0, right, h))))
if require_padding and not _has_slot_padding(slot):
return None
frames.append(slot)
return frames
def _content_runs(profile: list[int], *, threshold: int = 2) -> list[tuple[int, int]]:
@ -465,6 +761,52 @@ def _frame_x_ranges(strip, frame_count: int) -> list[tuple[int, int]] | None:
return [(l, r) for l, r in groups]
def _significant_subject_boxes(image) -> list[tuple[int, int, int, int]]:
    """Bounding boxes of the substantial subjects in *image*, related parts merged.

    Components lighter than ``max(32, 12% of the heaviest component's mass)``
    are ignored. An image with no opaque components yields ``[]``.
    """
    components = _component_boxes(image)
    if not components:
        return []
    heaviest = max(mass for _box, mass in components)
    mass_floor = max(32, heaviest * 0.12)
    substantial = [box for box, mass in components if mass >= mass_floor]
    return _merge_related_boxes(substantial)
def _validate_extracted_frames(frames: list, frame_count: int) -> None:
"""Reject rows where one "frame" is really multiple poses.
A bad provider roll can collapse a strip into tiny repeated poses. If we let
that through, normalization sees a huge motion envelope and shrinks the
entire pet to postage-stamp size. Catch the row here so hatch can regenerate
it instead of saving a technically non-empty but visually broken atlas.
"""
if len(frames) != frame_count:
raise ValueError(f"expected {frame_count} frames, got {len(frames)}")
boxes = []
for i, frame in enumerate(frames):
bbox = frame.getbbox()
if bbox is None:
raise ValueError(f"frame {i} is empty")
subjects = _significant_subject_boxes(frame)
if len(subjects) >= 3:
raise ValueError(f"frame {i} contains multiple separated subjects")
boxes.append(bbox)
if frame_count <= 1:
return
widths = sorted(b[2] - b[0] for b in boxes)
heights = sorted(b[3] - b[1] for b in boxes)
med_w = max(1, widths[len(widths) // 2])
med_h = max(1, heights[len(heights) // 2])
for i, (left, top, right, bottom) in enumerate(boxes):
width = right - left
height = bottom - top
# A legitimate wing/arm can be wider than the median pose. A frame that is
# several times wider while not proportionally taller is usually multiple
# mini-poses packed into one accepted frame.
if width > max(med_w * 3.0, med_w + 96) and height <= med_h * 1.6:
raise ValueError(f"frame {i} is a multi-pose width outlier")
def extract_strip_frames(
strip,
frame_count: int,
@ -475,15 +817,15 @@ def extract_strip_frames(
) -> list:
"""Turn one generated row strip into *frame_count* frames.
The background is keyed out, thin connecting bridges at the expected
boundaries are severed, then the strip is sliced at its empty chroma gutters
(:func:`_frame_x_ranges`) the plain "find each object, make a frame" cut
that works once poses are spaced apart (which generation now enforces).
The background is keyed out, then strict extraction treats the requested
frame count as the source of truth: slice known equal slots, isolate the real
subject in each slot, and require empty padding on X and Y. Empty chroma
gutters are only a lenient salvage fallback.
Each frame is cropped at full cell height so tall ears / halos are never
clipped; :func:`_drop_side_bleed` trims any faint neighbour sliver. When the
poses are touching (fewer gutters than frames) ``components`` raises and
``auto`` falls back to equal-width slots.
clipped; detached effects and neighbour slivers are dropped per slot. When a
pose does not have required space around it, ``components`` raises and
``auto`` falls back to best-effort slicing.
*fit* (default) fits+centers each frame into a 192x208 cell the standalone
contract for callers that don't normalize. Hatching passes ``fit=False`` to
@ -500,28 +842,38 @@ def extract_strip_frames(
strip = remove_background(strip, chroma_key=chroma_key)
# Prefer the real gutters as-is: when poses are already spaced (generation
# enforces this), slicing the strip untouched keeps each pose's own bounds and
# never cuts through an unevenly-placed silhouette. Only fall back to severing
# the expected boundaries when gaps alone can't separate the row — i.e. poses
# are bridged by a shared shadow/glow/1px line and read as one blob.
source = strip
ranges = _frame_x_ranges(source, frame_count)
if ranges is None:
source = _sever_expected_gutters(strip, frame_count)
ranges = _frame_x_ranges(source, frame_count)
if ranges is None:
# Strict path: count actual non-background subjects first. This handles both
# the intended one-row strip and model-cheated 2D grids without ever stacking
# two visual rows into one frame.
frames = _component_crops(strip, frame_count, require_padding=True)
if frames is None:
frames = _slot_crops(strip, frame_count, require_padding=True)
if frames is None:
if method == "components":
raise ValueError(f"could not segment {frame_count} sprites from strip")
frames = _slot_crops(source, frame_count)
else:
h = source.height
pad = max(2, min(16, round((source.width / max(1, frame_count)) * 0.04)))
frames = [
_drop_side_bleed(source.crop((max(0, left - pad), 0, min(source.width, right + pad), h)))
for left, right in ranges
]
raise ValueError(f"could not segment {frame_count} padded sprites from strip")
# Lenient salvage for the final attempt: prefer real gutters when they
# exist, then sever expected boundaries, then fall back to raw slots. Still
# try object extraction first, just without edge-padding enforcement, so
# cached/borderline model rolls can be inspected without stacking a 2D grid.
frames = _component_crops(strip, frame_count, require_padding=False)
if frames is None:
source = strip
ranges = _frame_x_ranges(source, frame_count)
if ranges is None:
source = _sever_expected_gutters(strip, frame_count)
ranges = _frame_x_ranges(source, frame_count)
if ranges is None:
frames = _slot_crops(source, frame_count, require_padding=False) or []
else:
h = source.height
pad = max(2, min(16, round((source.width / max(1, frame_count)) * 0.04)))
frames = [
_drop_side_bleed(_isolate_slot_subject(source.crop((max(0, left - pad), 0, min(source.width, right + pad), h))))
for left, right in ranges
]
_validate_extracted_frames(frames, frame_count)
return [_fit_to_cell(f) for f in frames] if fit else frames
@ -746,22 +1098,72 @@ def validate_atlas(atlas) -> dict:
return {"ok": False, "width": atlas.width, "height": atlas.height, "errors": errors, "warnings": warnings, "filled_states": []}
filled_states: list[str] = []
cell_boxes_by_state: dict[str, list[tuple[int, int, int, int]]] = {}
for state, row, count in ROW_SPECS:
row_pixels = 0
boxes: list[tuple[int, int, int, int]] = []
for col in range(count):
left = col * CELL_WIDTH
top = row * CELL_HEIGHT
cell = atlas.crop((left, top, left + CELL_WIDTH, top + CELL_HEIGHT))
nonblank = sum(cell.getchannel("A").histogram()[1:])
row_pixels += nonblank
bbox = cell.getbbox()
if bbox is not None:
boxes.append(bbox)
if row_pixels > 0:
filled_states.append(state)
cell_boxes_by_state[state] = boxes
else:
warnings.append(f"state '{state}' has no frames")
if not filled_states:
errors.append("atlas is empty — no state produced any frames")
# A visually valid pet must occupy the cell. A single bad row can otherwise
# poison global normalization and shrink every state to a tiny postage stamp
# while still passing the old "non-empty cells" check.
all_widths = sorted(
right - left
for boxes in cell_boxes_by_state.values()
for left, _top, right, _bottom in boxes
)
all_heights = sorted(
bottom - top
for boxes in cell_boxes_by_state.values()
for _left, top, _right, bottom in boxes
)
global_med_w = 0
global_med_h = 0
if all_widths and all_heights:
global_med_w = all_widths[len(all_widths) // 2]
median_h = all_heights[len(all_heights) // 2]
global_med_h = median_h
min_h = max(56, round(CELL_HEIGHT * 0.28))
if median_h < min_h:
errors.append(f"atlas sprites are too small after normalization (median frame height {median_h}px)")
for state, boxes in cell_boxes_by_state.items():
if len(boxes) <= 1:
continue
widths = sorted(right - left for left, _top, right, _bottom in boxes)
heights = sorted(bottom - top for _left, top, _right, bottom in boxes)
med_w = max(1, widths[len(widths) // 2])
med_h = max(1, heights[len(heights) // 2])
max_w = widths[-1]
max_h = heights[-1]
if max_w > max(med_w * 3.0, med_w + 96) and max_h <= med_h * 1.6:
errors.append(f"state '{state}' contains a multi-pose frame outlier")
# Per-state collapse guard: one malformed row (tiny slivers / chopped
# fragments) should not pass because other rows are healthy.
if global_med_w and global_med_h:
min_state_w = max(32, round(global_med_w * 0.42))
min_state_h = max(40, round(global_med_h * 0.50))
if med_w < min_state_w or med_h < min_state_h:
errors.append(
f"state '{state}' appears collapsed (median {med_w}x{med_h}px, global median {global_med_w}x{global_med_h}px)"
)
# Transparent pixels must carry zero RGB (no halo residue).
data = atlas.tobytes()
residue = 0

View file

@ -14,29 +14,40 @@ producing an ungrounded, drifting pet.
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
# Providers that can ground generation on a reference image.
# openrouter / nous reach Gemini Flash Image (and friends) over the
# OpenRouter-compatible chat-completions image protocol, which accepts
# reference images for grounding. Nous Portal proxies OpenRouter, so both
# qualify.
_REF_CAPABLE = ("openai", "openai-codex", "krea", "openrouter", "nous")
# Providers that can ground generation on a reference image, in preference order
# (Nous Portal → OpenAI → OpenRouter → …). OpenRouter/Nous run a quality-first
# model chain and may fall back depending on account access and endpoint behavior,
# so fidelity can vary by configured backend + model availability.
_REF_CAPABLE = ("nous", "openai", "openai-codex", "openrouter", "krea")
# Friendly label + one-line speed/quality note per reference-capable provider,
# surfaced in the desktop pet-gen picker so users can trade speed for fidelity.
_PROVIDER_META: dict[str, dict[str, str]] = {
"nous": {"label": "Nous Portal", "note": "Fast, balanced quality"},
"openrouter": {"label": "OpenRouter", "note": "Fastest — Gemini Flash Image"},
"openai": {"label": "OpenAI", "note": "Highest fidelity, slower"},
"openai-codex": {"label": "OpenAI (Codex)", "note": "Highest fidelity, slower"},
"krea": {"label": "Krea", "note": "Stylized, style-reference grounding"},
# Friendly display label per reference-capable provider, surfaced in the desktop
# pet-gen picker.
_PROVIDER_LABELS: dict[str, str] = {
"nous": "Nous Portal",
"openrouter": "OpenRouter",
"openai": "OpenAI",
"openai-codex": "OpenAI (Codex)",
"krea": "Krea",
}
def _forced_provider_from_env() -> str | None:
    """Optional QA override to force a pet-gen backend.

    ``HERMES_PET_IMAGE_PROVIDER=<name>`` (e.g. ``openrouter``) bypasses the
    normal active/default provider resolution for pet generation only. The
    value is trimmed and lower-cased before matching. Unset, empty, or unknown
    values return ``None`` so existing users are unaffected.
    """
    requested = os.environ.get("HERMES_PET_IMAGE_PROVIDER", "").strip().lower()
    if requested in _REF_CAPABLE:
        return requested
    return None
class GenerationError(RuntimeError):
"""Raised on any image-generation failure (no provider, API error, IO)."""
@ -71,6 +82,14 @@ def resolve_provider(*, require_references: bool = True, prefer: str | None = No
_discover()
from agent.image_gen_registry import get_active_provider, get_provider
# QA override: force one provider for pet-gen iteration regardless of the
# globally active image_gen backend.
forced = _forced_provider_from_env()
if forced:
chosen = get_provider(forced)
if chosen is not None and chosen.is_available():
return SpriteProvider(name=forced, provider=chosen, supports_references=True)
# An explicit user pick wins when it's reference-capable and has credentials;
# otherwise we ignore it and fall through to the normal resolution.
if prefer:
@ -110,10 +129,11 @@ def resolve_provider(*, require_references: bool = True, prefer: str | None = No
def list_sprite_providers() -> list[dict]:
"""The reference-capable providers available to pick for pet generation.
Returns ``[{name, label, note, default}]`` for every ref-capable provider the
user actually has credentials for, marking the one :func:`resolve_provider`
would choose with no explicit preference. Empty when none is configured (the
picker hides itself). Best-effort: discovery hiccups yield an empty list.
Returns ``[{name, label, default}]`` for every ref-capable provider the user
actually has credentials for, in preference order, marking the one
:func:`resolve_provider` would choose with no explicit preference. Empty when
none is configured (the picker hides itself). Best-effort: discovery hiccups
yield an empty list.
"""
_discover()
from agent.image_gen_registry import get_provider
@ -128,12 +148,10 @@ def list_sprite_providers() -> list[dict]:
provider = get_provider(name)
if provider is None or not provider.is_available():
continue
meta = _PROVIDER_META.get(name, {})
out.append(
{
"name": name,
"label": meta.get("label", name),
"note": meta.get("note", ""),
"label": _PROVIDER_LABELS.get(name, name),
"default": name == default_name,
}
)

View file

@ -38,7 +38,7 @@ _MAX_PARALLEL_GENERATIONS = 4
# How many times to (re)generate a single row before accepting a best-effort
# slice. Early attempts demand clean per-pose gutters; the last is lenient so a
# stubborn row still yields frames instead of dropping out entirely.
_ROW_GEN_ATTEMPTS = 2
_ROW_GEN_ATTEMPTS = 3
_MIN_FILLED_STATES = 6
_REQUIRED_STATES = frozenset({"idle", "running-right", "waving"})

View file

@ -63,12 +63,14 @@ _STYLE_HINTS: dict[str, str] = {
}
_BACKGROUND = (
"Center one full-body character on a flat, uniform, high-contrast chroma-key "
"background (prefer pure hot magenta #FF00FF unless that color appears on "
"the character). The background must completely surround the character: one "
"even color with NO gradient, vignette, texture, pattern, scenery, shadow, "
"ground line, frame, or border, so it keys out cleanly. The background color "
"must not appear anywhere on the character itself. No text, no labels."
"Center the character on a SINGLE flat, uniform, high-contrast chroma-key "
"background — pure hot magenta #FF00FF (only if magenta appears on the "
"character, use pure green #00FF00 instead). The background is ONE continuous "
"even color that completely surrounds the character with NO gradient, "
"vignette, texture, pattern, scenery, shadow, ground line, frame, border, "
"panel, comic cell, gutter line, grid, or divider of any kind, so it keys out "
"cleanly. The background color must not appear anywhere on the character. "
"No text, no labels, no speech bubbles, no UI."
)
@ -149,8 +151,12 @@ def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str |
f"(same species, face, colors, markings, proportions, and props), "
"preserving the same emotional tone/mood (e.g., scary stays scary, cute stays cute), "
f"draw a single WIDE horizontal strip of {frame_count} animation frames showing {action}. "
f"LAYOUT: split the wide strip into {frame_count} equal vertical cells, one "
"pose centered in each cell. "
f"LAYOUT: arrange {frame_count} poses in ONE horizontal row at equal spacing, "
"each pose centered in its own imaginary equal region. Draw NO panel borders, "
"NO comic cells, NO boxes, NO vertical divider/gutter lines, NO grid, NO frame "
"outlines between poses — the backdrop is one unbroken flat field behind all of them. "
"Fill the WHOLE strip with the SAME single flat chroma-key color as the attached "
"reference image's background (identical hue in every frame, no per-pose color shifts). "
f"SPACING (critical): draw each pose at a consistent, healthy, clearly "
f"visible size (roughly {pose_px}px wide on a {_ASSUMED_STRIP_WIDTH}px "
f"strip) — do NOT shrink it tiny — but keep its ENTIRE silhouette "
@ -166,8 +172,9 @@ def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str |
# so only the action moves — this is what stops the loop sliding/pulsing.
"REGISTRATION (critical): the character is the SAME height and SAME width "
"in every frame, drawn at the SAME scale, centered over the SAME point, "
"with all feet resting on ONE shared horizontal ground line across the "
"whole strip. Keep the body's center, size, and stance fixed frame to "
"with all feet aligned to the SAME invisible horizontal baseline across the "
"whole strip — this baseline is conceptual ONLY: draw NO ground line, floor, "
"platform, horizon, or contact shadow beneath the feet. Keep the body's center, size, and stance fixed frame to "
"frame — ONLY the limbs/features the action needs may move. Capes, cloaks, "
"bags, and scarves stay in the SAME place and shape every frame (no "
"swinging, flowing, or drifting) unless the action itself requires it. No "

View file

@ -5,8 +5,9 @@ import { Check, ChevronDown } from '@/lib/icons'
import { $petGenProvider, $petGenProviders, setPetGenProvider } from '@/store/pet-generate'
// Image-backend picker for pet generation — the composer's model-pill pattern:
// a quiet trigger + a dropdown of options, each with a one-line speed/quality
// note. Hidden unless there are 2+ reference-capable backends (nothing to pick).
// a quiet trigger + a dropdown of options. No per-option notes: the backend's
// provider list now carries only name/label/default, and fidelity depends on
// which model the chosen backend resolves to, so a fixed blurb would mislead.
// Hidden unless there are 2+ reference-capable backends (nothing to pick).
export function ProviderPicker() {
const providers = useStore($petGenProviders)
const picked = useStore($petGenProvider)
@ -32,19 +33,16 @@ export function ProviderPicker() {
</DropdownMenuTrigger>
{/* The picker lives inside the pet-gen Dialog (z-130) and portals to body,
so lift its menu above the dialog or it opens behind it. */}
<DropdownMenuContent align="start" className="z-[140] w-56">
<DropdownMenuContent align="start" className="z-[140]">
{providers.map(provider => (
<DropdownMenuItem
className="flex-col items-start gap-0.5"
className="flex items-center gap-1.5"
key={provider.name}
// Picking the default clears the override (no need to pin it).
onSelect={() => setPetGenProvider(provider.default ? '' : provider.name)}
>
<span className="flex w-full items-center gap-1.5">
<span className="min-w-0 flex-1 truncate font-medium text-foreground">{provider.label}</span>
{provider.name === current?.name && <Check className="size-3.5 text-primary" />}
</span>
{provider.note && <span className="text-[0.6875rem] text-(--ui-text-tertiary)">{provider.note}</span>}
<span className="min-w-0 flex-1 truncate font-medium text-foreground">{provider.label}</span>
{provider.name === current?.name && <Check className="size-3.5 text-primary" />}
</DropdownMenuItem>
))}
</DropdownMenuContent>

View file

@ -62,10 +62,11 @@ export function PetGenerateOverlay() {
// The footer banner narrates the dialog's async state: the failure reason on a
// dead-end error, else the "you can close this, we'll notify you" reassurance
// while a generate/hatch runs in the background.
// while a generate/hatch runs in the background. On step 1, show a neutral ETA.
const working = status === 'generating' || status === 'hatching'
const errored = status === 'error' && drafts.length === 0
const banner = errored ? error || copy.genericError : working ? copy.backgroundHint : undefined
const stepOne = status === 'idle' || status === 'ready'
const banner = errored ? error || copy.genericError : working ? copy.backgroundHint : stepOne ? copy.slowProviderHint : undefined
return (
<Dialog onOpenChange={handleOpenChange} open={open}>

View file

@ -798,6 +798,7 @@ export const en: Translations = {
namePlaceholder: 'Name your pet',
staleBackend: 'Update Hermes to generate pets.',
backgroundHint: 'You can close this — Hermes will notify you when its done.',
slowProviderHint: 'This can take up to 5 minutes',
genericError: 'Generation failed — try again or pick a suggestion.',
referenceImageTooLarge: 'Reference image is too large. Use one under 16 MB.',
referenceImageInvalid: 'Could not read that reference image. Try a PNG, JPG, WebP, or GIF.',

View file

@ -916,6 +916,7 @@ export const ja = defineLocale({
namePlaceholder: 'ペットに名前を付ける',
staleBackend: 'ペットを生成するには Hermes を更新してください。',
backgroundHint: 'このウィンドウは閉じても大丈夫です。完了したら Hermes が通知します。',
slowProviderHint: 'これには最大5分かかることがあります。',
genericError: '生成に失敗しました。もう一度試すか、候補を選んでください。',
referenceImageTooLarge: '参照画像が大きすぎます。16 MB 未満の画像を使ってください。',
referenceImageInvalid: '参照画像を読み込めませんでした。PNG/JPG/WebP/GIF を試してください。',

View file

@ -702,6 +702,7 @@ export interface Translations {
namePlaceholder: string
staleBackend: string
backgroundHint: string
slowProviderHint: string
genericError: string
referenceImageTooLarge: string
referenceImageInvalid: string

View file

@ -888,6 +888,7 @@ export const zhHant = defineLocale({
namePlaceholder: '為寵物命名',
staleBackend: '請更新 Hermes 以生成寵物。',
backgroundHint: '你可以關閉此視窗——完成後 Hermes 會通知你。',
slowProviderHint: '這可能最多需要 5 分鐘。',
genericError: '生成失敗——請重試或選一個建議。',
referenceImageTooLarge: '參考圖片過大。請使用小於 16 MB 的圖片。',
referenceImageInvalid: '無法讀取該參考圖片。請嘗試 PNG、JPG、WebP 或 GIF。',

View file

@ -986,6 +986,7 @@ export const zh: Translations = {
namePlaceholder: '给宠物起个名字',
staleBackend: '请更新 Hermes 以生成宠物。',
backgroundHint: '你可以关闭此窗口——完成后 Hermes 会通知你。',
slowProviderHint: '这可能最多需要 5 分钟。',
genericError: '生成失败——请重试或选择一个建议。',
referenceImageTooLarge: '参考图过大。请使用小于 16 MB 的图片。',
referenceImageInvalid: '无法读取该参考图。请尝试 PNG、JPG、WebP 或 GIF。',

View file

@ -96,6 +96,19 @@ describe('dispatchNativeNotification focus gating', () => {
dispatchNativeNotification({ kind: 'approval', sessionId: 'on-screen', title: 'approve' })
expect(notify).not.toHaveBeenCalled()
})
// Pet generation is dispatched with `global: true` and no sessionId, so with the
// active session cleared there is nothing to session-match against; this pins that
// the completion notification is still delivered exactly once.
// NOTE(review): presumably the suite's setup leaves the window unfocused/hidden ("away") — confirm.
it('fires a global completion notification while away with no active session (pet gen)', () => {
setActiveSessionId(null)
dispatchNativeNotification({ global: true, kind: 'backgroundDone', title: 'Your pet hatched' })
expect(notify).toHaveBeenCalledTimes(1)
})
// Counterpart to the "away" case: the window is explicitly marked focused and
// visible before dispatching, and the global notification must then not reach
// `notify` — the `global` flag is not an unconditional bypass.
it('suppresses a global notification when the window is focused', () => {
setWindowState({ focused: true, hidden: false })
setActiveSessionId(null)
dispatchNativeNotification({ global: true, kind: 'backgroundDone', title: 'Your pet hatched' })
expect(notify).not.toHaveBeenCalled()
})
})
describe('dispatchNativeNotification preferences', () => {

View file

@ -113,7 +113,15 @@ function isBackgrounded(): boolean {
return typeof document.hasFocus === 'function' && !document.hasFocus()
}
function shouldFire(kind: NativeNotificationKind, sessionId?: null | string): boolean {
function shouldFire(kind: NativeNotificationKind, sessionId?: null | string, global = false): boolean {
// Global notifications aren't tied to a chat session (e.g. pet generation,
// which runs from the command center with no active conversation). They fire
// whenever the user is away, with no session-match requirement — otherwise a
// background run started without an open session would be silently dropped.
if (global) {
return isBackgrounded()
}
// Attention kinds break through for an off-screen session even while focused.
if (ATTENTION_KINDS.has(kind)) {
return isBackgrounded() || (Boolean(sessionId) && sessionId !== $activeSessionId.get())
@ -134,6 +142,12 @@ export interface NativeNotificationInput {
title: string
body?: string
sessionId?: null | string
/**
* Not tied to a chat session (e.g. pet generation). Fires whenever the user
* is away, bypassing the session-match gate that completion kinds normally
* require.
*/
global?: boolean
silent?: boolean
actions?: NativeNotificationAction[]
}
@ -145,11 +159,11 @@ export function dispatchNativeNotification(input: NativeNotificationInput): void
return
}
if (!shouldFire(input.kind, input.sessionId)) {
if (!shouldFire(input.kind, input.sessionId, input.global)) {
return
}
if (throttled(`${input.kind}:${input.sessionId ?? ''}`, Date.now())) {
if (throttled(`${input.kind}:${input.sessionId ?? (input.global ? 'global' : '')}`, Date.now())) {
return
}

View file

@ -6,8 +6,6 @@ import { dispatchNativeNotification } from '@/store/native-notifications'
import { notify } from '@/store/notifications'
import { type PetInfo } from '@/store/pet'
import { applyAdoptedPet, type GatewayRequest } from '@/store/pet-gallery'
import { $activeSessionId } from '@/store/session'
/**
* Feature store for the "generate a pet" flow (Cmd-K Pets Generate).
*
@ -111,8 +109,6 @@ export const $petGenAvailable = atom<boolean | null>(null)
export interface PetGenProvider {
name: string
label: string
/** One-line speed/quality tradeoff note. */
note: string
/** Whether this is the backend's default pick (no override needed). */
default: boolean
}
@ -227,7 +223,10 @@ function notifyPetGenDone(title: string, message: string, kind: 'error' | 'succe
}
notify({ kind, title, message, action: { label: 'View', onClick: openPetGenerate } })
dispatchNativeNotification({ kind: 'backgroundDone', title, body: message, sessionId: $activeSessionId.get() })
// Pet generation isn't tied to a chat session — mark it global so the OS
// notification fires whenever the user is away, even with no active session
// (the common case: generating from the command center with no conversation).
dispatchNativeNotification({ kind: 'backgroundDone', title, body: message, global: true })
}
interface GenerateOptions {

View file

@ -3,7 +3,7 @@
Both OpenRouter and the Nous Portal inference endpoint speak the same
OpenAI-style ``/chat/completions`` image-generation protocol: send
``modalities: ["image", "text"]`` with an image-output model (e.g.
``google/gemini-2.5-flash-image``), pass reference images as ``image_url``
``google/gemini-3-pro-image``), pass reference images as ``image_url``
content parts for grounding, and read the generated images back from
``choices[0].message.images[].image_url.url`` (a ``data:image/...;base64`` URI).
@ -40,10 +40,17 @@ from agent.image_gen_provider import (
logger = logging.getLogger(__name__)
# Default image-output model. Gemini 2.5 Flash Image ("nano-banana") is GA on
# OpenRouter, accepts reference images for grounding, and honors
# ``image_config.aspect_ratio``.
DEFAULT_MODEL = "google/gemini-2.5-flash-image"
# Quality-first model chain for OpenRouter-compatible endpoints.
#
# Default behavior (no env/config override): try the highest-fidelity OpenAI
# image model first, then fall back to Gemini 3 Pro Image if the OpenAI model
# is access-gated / unavailable / times out on this endpoint.
#
# Explicit override (OPENROUTER_IMAGE_MODEL or image_gen.<provider>.model):
# use exactly that model (no auto fallback), so power users keep full control.
DEFAULT_MODEL = "openai/gpt-5.4-image-2"
_FALLBACK_MODEL = "google/gemini-3-pro-image"
_DEFAULT_MODEL_CHAIN = (DEFAULT_MODEL, _FALLBACK_MODEL)
# Semantic aspect ratio (the image_gen contract) → OpenRouter's image_config
# aspect_ratio strings.
@ -121,6 +128,43 @@ def _extract_images(payload: Dict[str, Any]) -> List[str]:
return out
def _access_error_hint(
display: str, model_id: str, env_var: str, status: int, err_msg: str
) -> Optional[str]:
"""A targeted hint when an access-gated OpenAI image model can't be reached.
Some OpenAI image models on OpenRouter need account enablement / BYOK, so the
failure isn't a missing key (the key is valid) — the *model* is unreachable.
The generic "check your key" message is misleading there, so we detect that
case and point the user at the real fix. Returns one actionable line, or
``None`` when this isn't the access-gated case.
"""
if not model_id.startswith("openai/"):
return None
low = (err_msg or "").lower()
gated = status in (402, 403, 404) or any(
s in low for s in ("no endpoints", "no allowed", "not a valid model", "data policy")
)
if not gated:
return None
return (
f"{display} can't reach image model '{model_id}' ({status}) — enable OpenAI "
f"image access in your {display} account, or set {env_var}={_FALLBACK_MODEL}."
)
def _dedupe_models(models: list[str]) -> list[str]:
out: list[str] = []
seen: set[str] = set()
for model in models:
m = (model or "").strip()
if not m or m in seen:
continue
seen.add(m)
out.append(m)
return out
class OpenRouterCompatImageProvider(ImageGenProvider):
"""Image generation over an OpenRouter-compatible chat-completions endpoint.
@ -180,9 +224,14 @@ class OpenRouterCompatImageProvider(ImageGenProvider):
return [
{
"id": DEFAULT_MODEL,
"display": "Gemini 2.5 Flash Image (nano-banana)",
"strengths": "Reference-grounded edits; aspect-ratio control",
}
"display": "OpenAI GPT-5.4 Image 2",
"strengths": "Highest fidelity; best prompt adherence; slower on OpenRouter",
},
{
"id": _FALLBACK_MODEL,
"display": "Gemini 3 Pro Image",
"strengths": "Fast, reliable fallback with good layout adherence",
},
]
def default_model(self) -> Optional[str]:
@ -193,16 +242,24 @@ class OpenRouterCompatImageProvider(ImageGenProvider):
def _resolve_model(self) -> str:
"""Pick the image model: env override → config → :data:`DEFAULT_MODEL`."""
return self._resolve_model_chain()[0]
def _resolve_model_chain(self) -> list[str]:
"""Ordered model attempts for this request.
Explicit user/model config means "use this exact model", so no fallback.
Without overrides we run the quality-first default chain.
"""
env_override = os.environ.get(self._model_env_var, "").strip()
if env_override:
return env_override
return [env_override]
cfg = _load_image_gen_config()
scoped = cfg.get(self._config_key) if isinstance(cfg.get(self._config_key), dict) else {}
if isinstance(scoped, dict):
value = scoped.get("model")
if isinstance(value, str) and value.strip():
return value.strip()
return DEFAULT_MODEL
return [value.strip()]
return _dedupe_models(list(_DEFAULT_MODEL_CHAIN))
def generate(
self,
@ -237,7 +294,7 @@ class OpenRouterCompatImageProvider(ImageGenProvider):
aspect_ratio=aspect_ratio,
)
model_id = self._resolve_model()
model_chain = self._resolve_model_chain()
aspect = resolve_aspect_ratio(aspect_ratio)
or_aspect = _ASPECT_RATIOS.get(aspect, "1:1")
@ -258,12 +315,6 @@ class OpenRouterCompatImageProvider(ImageGenProvider):
if part:
content.append({"type": "image_url", "image_url": {"url": part}})
payload: Dict[str, Any] = {
"model": model_id,
"modalities": ["image", "text"],
"messages": [{"role": "user", "content": content}],
"image_config": {"aspect_ratio": or_aspect},
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
@ -271,102 +322,145 @@ class OpenRouterCompatImageProvider(ImageGenProvider):
"HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
"X-Title": "Hermes Agent",
}
try:
response = requests.post(
f"{base_url}/chat/completions",
headers=headers,
json=payload,
timeout=_REQUEST_TIMEOUT,
)
response.raise_for_status()
except requests.HTTPError as exc:
resp = exc.response
status = resp.status_code if resp is not None else 0
last_error: Optional[Dict[str, Any]] = None
for i, model_id in enumerate(model_chain):
payload: Dict[str, Any] = {
"model": model_id,
"modalities": ["image", "text"],
"messages": [{"role": "user", "content": content}],
"image_config": {"aspect_ratio": or_aspect},
}
is_last = i == len(model_chain) - 1
try:
err_msg = resp.json().get("error", {}).get("message", resp.text[:300])
except Exception: # noqa: BLE001
err_msg = resp.text[:300] if resp is not None else str(exc)
logger.error("%s image gen failed (%d): %s", self._name, status, err_msg)
return error_response(
error=f"{self._display} image generation failed ({status}): {err_msg}",
error_type="api_error",
provider=self._name,
response = requests.post(
f"{base_url}/chat/completions",
headers=headers,
json=payload,
timeout=_REQUEST_TIMEOUT,
)
response.raise_for_status()
except requests.HTTPError as exc:
resp = exc.response
status = resp.status_code if resp is not None else 0
try:
err_msg = resp.json().get("error", {}).get("message", resp.text[:300])
except Exception: # noqa: BLE001
err_msg = resp.text[:300] if resp is not None else str(exc)
logger.error("%s image gen failed (%d) on %s: %s", self._name, status, model_id, err_msg)
hint = _access_error_hint(self._display, model_id, self._model_env_var, status, err_msg)
if hint and not is_last:
logger.info(
"%s model %s unavailable; retrying with fallback %s",
self._name,
model_id,
model_chain[i + 1],
)
continue
last_error = error_response(
error=hint or f"{self._display} image generation failed ({status}): {err_msg}",
error_type="model_access" if hint else "api_error",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
return last_error
except requests.Timeout:
if not is_last:
logger.info(
"%s model %s timed out; retrying with fallback %s",
self._name,
model_id,
model_chain[i + 1],
)
continue
return error_response(
error=f"{self._display} image generation timed out "
f"({int(_REQUEST_TIMEOUT)}s)",
error_type="timeout",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
except requests.ConnectionError as exc:
return error_response(
error=f"{self._display} connection error: {exc}",
error_type="connection_error",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
try:
result = response.json()
except Exception as exc: # noqa: BLE001
return error_response(
error=f"{self._display} returned invalid JSON: {exc}",
error_type="invalid_response",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
images = _extract_images(result)
if not images:
if not is_last:
logger.info(
"%s model %s returned no image; retrying with fallback %s",
self._name,
model_id,
model_chain[i + 1],
)
continue
# A response with text but no image usually means the model didn't
# honor image output (wrong model or modalities); surface that.
return error_response(
error=(
f"{self._display} returned no image. Ensure the model "
f"'{model_id}' supports image output."
),
error_type="empty_response",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
first = images[0]
try:
if first.startswith("data:"):
b64 = first.split(",", 1)[1] if "," in first else ""
saved_path = save_b64_image(b64, prefix=f"{self._name}_gen")
else:
saved_path = save_url_image(first, prefix=f"{self._name}_gen")
except Exception as exc: # noqa: BLE001
return error_response(
error=f"Could not save generated image: {exc}",
error_type="io_error",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
return success_response(
image=str(saved_path),
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
except requests.Timeout:
return error_response(
error=f"{self._display} image generation timed out "
f"({int(_REQUEST_TIMEOUT)}s)",
error_type="timeout",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
except requests.ConnectionError as exc:
return error_response(
error=f"{self._display} connection error: {exc}",
error_type="connection_error",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
try:
result = response.json()
except Exception as exc: # noqa: BLE001
return error_response(
error=f"{self._display} returned invalid JSON: {exc}",
error_type="invalid_response",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
images = _extract_images(result)
if not images:
# A response with text but no image usually means the model didn't
# honor image output (wrong model or modalities); surface that.
return error_response(
error=(
f"{self._display} returned no image. Ensure the model "
f"'{model_id}' supports image output."
),
error_type="empty_response",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
first = images[0]
try:
if first.startswith("data:"):
b64 = first.split(",", 1)[1] if "," in first else ""
saved_path = save_b64_image(b64, prefix=f"{self._name}_gen")
else:
saved_path = save_url_image(first, prefix=f"{self._name}_gen")
except Exception as exc: # noqa: BLE001
return error_response(
error=f"Could not save generated image: {exc}",
error_type="io_error",
provider=self._name,
model=model_id,
prompt=prompt,
aspect_ratio=aspect,
)
return success_response(
image=str(saved_path),
model=model_id,
return last_error or error_response(
error=f"{self._display} image generation failed after trying all candidate models.",
error_type="api_error",
provider=self._name,
model=model_chain[-1] if model_chain else "",
prompt=prompt,
aspect_ratio=aspect,
provider=self._name,
)

View file

@ -52,6 +52,21 @@ def test_extract_strip_frames_keys_out_solid_background():
assert frames[0].getpixel((0, 0))[3] == 0
def test_remove_background_defringes_antialiased_edge():
    """Keying a sprite off magenta also shaves roughly one pixel from each edge.

    The ring where sprite meets backdrop is a blend that survives the key, so
    the defringe pass erodes the silhouette by ~1px per side while leaving the
    interior opaque.
    """
    canvas = Image.new("RGBA", (200, 200), (255, 0, 255, 255))
    ImageDraw.Draw(canvas).rectangle((50, 50, 149, 149), fill=(40, 200, 60, 255))  # 100x100 green

    result = atlas.remove_background(canvas)
    bounds = result.getbbox()

    assert bounds is not None
    left, top, right, bottom = bounds
    width = right - left
    height = bottom - top
    # ~1px shaved per side of the 100px square.
    assert 96 <= width <= 99
    assert 96 <= height <= 99
    # The centre of the sprite is still visible.
    assert result.getpixel((100, 100))[3] > 0
def test_remove_background_clears_trapped_chroma_pocket():
# Green body enclosing a magenta pocket (the "pink between the arm" case):
# the pocket isn't border-reachable, so it must be cleared by interior seeding.
@ -106,6 +121,47 @@ def test_extract_strip_frames_drops_small_side_lobes_from_adjacent_frames():
assert right_edge_mass == 0
def test_extract_strip_frames_drops_detached_slot_effects():
    """A small sparkle detached from the subject is dropped from the extracted frame."""
    cell = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0))
    pen = ImageDraw.Draw(cell)
    subject_box = (72, 54, 148, 172)
    sparkle_points = [(10, 76), (16, 84), (24, 78), (18, 88)]
    pen.ellipse(subject_box, fill=(70, 190, 70, 255))
    pen.polygon(sparkle_points, fill=(255, 255, 160, 255))

    extracted = atlas.extract_strip_frames(cell, 1, method="components", fit=False)
    bounds = extracted[0].getbbox()

    assert bounds is not None
    # The sparkle was drawn at x=10..24; a left edge beyond 40 means it is gone.
    left_edge = bounds[0]
    assert left_edge > 40
def test_extract_strip_frames_requires_slot_padding_in_strict_mode():
    """A frame flush against the top edge makes extraction of the row raise.

    Rejecting the row lets the caller regenerate instead of accepting a clipped
    pet frame.
    """
    cell_w, cell_h = atlas.CELL_WIDTH, atlas.CELL_HEIGHT
    strip = Image.new("RGBA", (cell_w * 2, cell_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(strip)
    body = (70, 190, 70, 255)

    # Slot 0 starts at y=0, i.e. it touches the top edge; slot 1 has padding.
    pen.rectangle((40, 0, 120, 130), fill=body)
    pen.rectangle((cell_w + 40, 40, cell_w + 120, 170), fill=body)

    with pytest.raises(ValueError):
        atlas.extract_strip_frames(strip, 2, method="components", fit=False)
def test_extract_strip_frames_rejects_multi_pose_frame_outlier():
    """One frame holding several separated figures must fail frame validation."""
    cell_size = (atlas.CELL_WIDTH, atlas.CELL_HEIGHT)
    ink = (220, 240, 255, 255)

    def blank_cell():
        return Image.new("RGBA", cell_size, (0, 0, 0, 0))

    # Three ordinary frames: a single figure each.
    singles = []
    for _ in range(3):
        cell = blank_cell()
        ImageDraw.Draw(cell).rectangle((82, 120, 108, 178), fill=ink)
        singles.append(cell)

    # The outlier: five separated figures crammed into one cell.
    crowded = blank_cell()
    pen = ImageDraw.Draw(crowded)
    for left in (10, 50, 90, 130, 166):
        pen.rectangle((left, 124, left + 12, 172), fill=ink)

    with pytest.raises(ValueError, match="multiple separated subjects"):
        atlas._validate_extracted_frames([*singles, crowded], 4)
def test_extract_strip_frames_uses_real_gutters_when_spacing_is_uneven():
# gpt-image often returns a square chroma strip whose poses are separated but
# not laid out on exact equal-width slots. Equal slot slicing would include
@ -183,6 +239,35 @@ def test_validate_atlas_rejects_rgb_residue():
assert any("residue" in e for e in result["errors"])
def test_validate_atlas_rejects_postage_stamp_sprite():
    """An atlas whose every cell holds a tiny sprite is reported as too small."""
    stamp = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0))
    ImageDraw.Draw(stamp).rectangle((86, 174, 106, 201), fill=(220, 240, 255, 255))

    sheet = Image.new("RGBA", (atlas.ATLAS_WIDTH, atlas.ATLAS_HEIGHT), (0, 0, 0, 0))
    # Paste the same tiny sprite into every populated cell of every state row.
    for _state, row, count in atlas.ROW_SPECS:
        top = row * atlas.CELL_HEIGHT
        for col in range(count):
            sheet.alpha_composite(stamp, (col * atlas.CELL_WIDTH, top))

    report = atlas.validate_atlas(sheet)

    assert not report["ok"]
    assert any("too small" in message for message in report["errors"])
def test_validate_atlas_rejects_one_collapsed_state_row():
    """Shrinking only the ``failed`` row is flagged as a collapsed state."""
    shrunk = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0))
    ImageDraw.Draw(shrunk).rectangle((90, 150, 106, 199), fill=(220, 240, 255, 255))

    per_state = _frames_for_all_states()
    per_state["failed"] = [shrunk.copy() for _ in range(atlas.FRAME_COUNTS["failed"])]

    report = atlas.validate_atlas(atlas.compose_atlas(per_state))

    assert not report["ok"]
    # The error has to name both the symptom and the offending state.
    flagged = [err for err in report["errors"] if "appears collapsed" in err and "failed" in err]
    assert flagged
def test_validate_atlas_warns_on_empty_state():
frames = _frames_for_all_states()
frames["jumping"] = []
@ -463,9 +548,12 @@ def test_list_sprite_providers_marks_default(monkeypatch):
listed = imagegen.list_sprite_providers()
names = {p["name"] for p in listed}
assert names == {"openai", "nous"}
# Every entry carries display metadata, and exactly one is the default.
assert all(p["label"] and "note" in p for p in listed)
# Every entry carries a display label (no quality note — all backends are equal).
assert all(p["label"] for p in listed)
assert all("note" not in p for p in listed)
assert [p["name"] for p in listed if p["default"]] == ["openai"]
# Listed in preference order: Nous Portal before OpenAI.
assert [p["name"] for p in listed] == ["nous", "openai"]
def test_generate_retries_without_transparent_background(monkeypatch, tmp_path):

View file

@ -99,11 +99,22 @@ class TestProviderClass:
with patch("plugins.image_gen.openrouter._load_image_gen_config", return_value={}):
assert _openrouter().default_model() == DEFAULT_MODEL
assert DEFAULT_MODEL == "google/gemini-2.5-flash-image"
# Default must be an image-output model id (provider/model form).
assert "/" in DEFAULT_MODEL and "image" in DEFAULT_MODEL
def test_default_chain_prefers_quality_then_fallback(self):
    """With no overrides, the chain is the default one: OpenAI first, fallback last."""
    from plugins.image_gen.openrouter import _FALLBACK_MODEL, _DEFAULT_MODEL_CHAIN

    with patch("plugins.image_gen.openrouter._load_image_gen_config", return_value={}):
        resolved = _openrouter()._resolve_model_chain()

    assert resolved == list(_DEFAULT_MODEL_CHAIN)
    head, tail = resolved[0], resolved[-1]
    assert head.startswith("openai/")
    assert tail == _FALLBACK_MODEL
def test_model_env_override(self, monkeypatch):
    """An env override pins both the single model and a one-element chain."""
    pinned = "black-forest-labs/flux.2-pro"
    monkeypatch.setenv("OPENROUTER_IMAGE_MODEL", pinned)

    single = _openrouter()._resolve_model()
    chain = _openrouter()._resolve_model_chain()

    assert single == pinned
    assert chain == [pinned]
def test_model_config_override(self):
cfg = {"openrouter": {"model": "google/gemini-3.1-flash-image-preview"}}
@ -153,6 +164,30 @@ class TestHelpers:
assert _extract_images({"choices": [{"message": {"content": "no image"}}]}) == []
def test_access_error_hint_for_gated_openai_model(self):
    """A 404 on an ``openai/`` model yields a hint naming the model, env var and fallback."""
    from plugins.image_gen.openrouter import _FALLBACK_MODEL, _access_error_hint

    model = "openai/gpt-5.4-image-2"
    env_var = "OPENROUTER_IMAGE_MODEL"

    hint = _access_error_hint("OpenRouter", model, env_var, 404, "No endpoints found")

    assert hint is not None
    for fragment in (model, env_var, _FALLBACK_MODEL):
        assert fragment in hint
    # Stays a single line under the humanizer's 200-char truncation.
    assert "\n" not in hint
    assert len(hint) <= 200
def test_access_error_hint_ignores_non_openai_models(self):
    """A gated-looking status on a non-``openai/`` model produces no hint."""
    from plugins.image_gen.openrouter import _access_error_hint

    hint = _access_error_hint("OpenRouter", "google/gemini-3-pro-image", "X", 404, "boom")

    assert hint is None
def test_access_error_hint_ignores_unrelated_errors(self):
    """An ``openai/`` model failing without any access signal produces no hint."""
    from plugins.image_gen.openrouter import _access_error_hint

    # A 500 server error with an openai model but no access marker in the text.
    hint = _access_error_hint("OpenRouter", "openai/gpt-5.4-image-2", "X", 500, "server error")

    assert hint is None
# ---------------------------------------------------------------------------
# generate()
@ -260,10 +295,11 @@ class TestGenerate:
resp.raise_for_status.side_effect = req_lib.HTTPError(response=resp)
with patch(_RUNTIME, return_value=_runtime_ok()), \
patch("requests.post", return_value=resp):
patch("requests.post", return_value=resp) as mock_post:
result = _openrouter().generate(prompt="a pet")
assert result["success"] is False
assert result["error_type"] == "api_error"
assert mock_post.call_count == 1
def test_timeout(self):
import requests as req_lib
@ -274,6 +310,55 @@ class TestGenerate:
assert result["success"] is False
assert result["error_type"] == "timeout"
def test_access_gated_model_surfaces_hint(self, monkeypatch):
    """An explicitly pinned OpenAI model that 404s reports ``model_access``.

    The error carries the actionable access hint rather than a generic API
    failure, and because the model was pinned via the env var only one request
    is made (no automatic fallback).
    """
    import requests as req_lib

    monkeypatch.setenv("OPENROUTER_IMAGE_MODEL", "openai/gpt-5.4-image-2")

    gated = MagicMock()
    gated.status_code = 404
    gated.text = "No endpoints found for openai/gpt-5.4-image-2"
    gated.json.return_value = {"error": {"message": "No endpoints found"}}
    gated.raise_for_status.side_effect = req_lib.HTTPError(response=gated)

    with patch(_RUNTIME, return_value=_runtime_ok()), \
            patch("requests.post", return_value=gated) as post:
        outcome = _openrouter().generate(prompt="a pet")

    assert outcome["success"] is False
    assert outcome["error_type"] == "model_access"
    assert "OpenAI image access" in outcome["error"]
    # Explicit override: no auto-fallback chain.
    assert post.call_count == 1
def test_access_gated_default_model_falls_back_to_gemini(self):
    """With no override, a gated default model is retried on the fallback model."""
    import requests as req_lib
    from plugins.image_gen.openrouter import DEFAULT_MODEL, _FALLBACK_MODEL

    # First response: the default model is access-gated (404, "No endpoints found").
    refused = MagicMock()
    refused.status_code = 404
    refused.text = f"No endpoints found for {DEFAULT_MODEL}"
    refused.json.return_value = {"error": {"message": "No endpoints found"}}
    refused.raise_for_status.side_effect = req_lib.HTTPError(response=refused)

    # Second response: a successful image payload.
    responses = [refused, _mock_chat_response([_PNG_DATA_URI])]
    saved_to = Path("/tmp/openrouter_gen_fallback.png")

    with patch(_RUNTIME, return_value=_runtime_ok()), \
            patch("requests.post", side_effect=responses) as post, \
            patch("plugins.image_gen.openrouter.save_b64_image", return_value=saved_to):
        outcome = _openrouter().generate(prompt="a pet")

    assert outcome["success"] is True
    assert outcome["model"] == _FALLBACK_MODEL
    assert outcome["image"] == "/tmp/openrouter_gen_fallback.png"

    # Exactly two requests, in chain order.
    assert post.call_count == 2
    attempted = [call.kwargs["json"]["model"] for call in post.call_args_list]
    assert attempted[0] == DEFAULT_MODEL
    assert attempted[1] == _FALLBACK_MODEL
# ---------------------------------------------------------------------------
# Registration + pet integration