From e92b5c6af8bef10827a14a88986f2507af5e0255 Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Wed, 24 Jun 2026 23:01:31 -0500 Subject: [PATCH] feat(pets): quality-first OpenRouter model chain + stronger atlas gates + global pet-gen notifications OpenRouter/Nous image gen now runs a quality-first model chain by default: attempt the highest-fidelity OpenAI image model first, then fall back to Gemini 3 Pro Image when it's access-gated/unavailable/times out. An explicit OPENROUTER_IMAGE_MODEL / config model override pins one model with no fallback. Atlas validation rejects malformed model output instead of shipping it: adds a per-state collapse guard (a single sliver/fragment row no longer passes because other rows are healthy), on top of the existing postage-stamp + multi-pose checks. Desktop: pet-gen native notifications are now "global" (not tied to a chat session), so a background generation started from the command center fires an OS notification when the user is away even with no active session. Adds a neutral "This can take up to 5 minutes." banner on step 1, and lets the provider picker auto-size. Tests updated/added for the OpenRouter fallback chain, the collapse guard, and the global notification path. 
--- agent/pet/generate/atlas.py | 470 ++++++++++++++++-- agent/pet/generate/imagegen.py | 60 ++- agent/pet/generate/orchestrate.py | 2 +- agent/pet/generate/prompts.py | 27 +- .../components/provider-picker.tsx | 16 +- .../app/pet-generate/pet-generate-overlay.tsx | 5 +- apps/desktop/src/i18n/en.ts | 7 +- apps/desktop/src/i18n/ja.ts | 1 + apps/desktop/src/i18n/types.ts | 1 + apps/desktop/src/i18n/zh-hant.ts | 1 + apps/desktop/src/i18n/zh.ts | 1 + .../src/store/native-notifications.test.ts | 13 + .../desktop/src/store/native-notifications.ts | 20 +- apps/desktop/src/store/pet-generate.ts | 9 +- plugins/image_gen/openrouter/__init__.py | 304 +++++++---- tests/agent/test_pet_generate.py | 92 +++- .../test_openrouter_compat_provider.py | 89 +++- 17 files changed, 922 insertions(+), 196 deletions(-) diff --git a/agent/pet/generate/atlas.py b/agent/pet/generate/atlas.py index 2d316110e73..b631d79f359 100644 --- a/agent/pet/generate/atlas.py +++ b/agent/pet/generate/atlas.py @@ -127,6 +127,22 @@ def _near_key_mask(image, key: tuple[int, int, int], tol: int = 48): ) +def _defringe(rgba): + """Shave the 1px antialiased edge ring left after keying. + + Chroma keying can't catch the antialiased band where the sprite meets the + backdrop — those pixels are a key/sprite blend, too far from the key to be + removed, so they ring the cutout in magenta/green. Erode the alpha by one + pixel (a 3x3 min filter) to drop that contaminated ring; the sprite's own + thick dark outline keeps the silhouette intact. Built on a C-level filter, no + per-pixel Python. + """ + from PIL import ImageFilter + + rgba.putalpha(rgba.getchannel("A").filter(ImageFilter.MinFilter(3))) + return rgba + + def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None, threshold: float = 90.0): """Return *image* (RGBA) with its flat background keyed out to transparent. 
@@ -163,7 +179,8 @@ def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None, near = _near_key_mask(rgba, key) # L mask, 255 where near key opaque = rgba.getchannel("A").point(lambda a: 255 if a > _ALPHA_FLOOR else 0) remove_mask = ImageChops.darker(near, opaque) - return Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, remove_mask) + keyed = Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, remove_mask) + return _defringe(keyed) visited = bytearray(w * h) # Mark removals in a flat mask and apply them in one C composite at the end — @@ -209,7 +226,7 @@ def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None, # One C-level composite instead of millions of per-pixel writes: paint the # flooded pixels to (0,0,0,0) wherever the mask is set. mask = Image.frombytes("L", (w, h), bytes(remove)).point(lambda v: 255 if v else 0) - return Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, mask) + return _defringe(Image.composite(Image.new("RGBA", rgba.size, (0, 0, 0, 0)), rgba, mask)) def _repair_internal_alpha_holes(image): @@ -369,6 +386,279 @@ def _drop_side_bleed(image): return rgba +def _erase_long_axis_lines(image): + """Remove thin slot-spanning guide/floor/divider lines. + + Gemini will sometimes satisfy "baseline" / "cell" language by drawing + literal horizontal floors or vertical panel dividers. They survive chroma + keying and connect otherwise clean poses. Drop only *thin* rows/columns that + span nearly the whole slot; thick sprite body rows are left alone. 
+ """ + from PIL import Image + + rgba = image.convert("RGBA").copy() + w, h = rgba.size + alpha = rgba.getchannel("A") + + def _thin_groups(indices: list[int]) -> list[tuple[int, int]]: + groups: list[tuple[int, int]] = [] + start: int | None = None + prev: int | None = None + for idx in indices: + if start is None: + start = prev = idx + continue + if prev is not None and idx == prev + 1: + prev = idx + continue + if start is not None and prev is not None and prev - start + 1 <= 4: + groups.append((start, prev + 1)) + start = prev = idx + if start is not None and prev is not None and prev - start + 1 <= 4: + groups.append((start, prev + 1)) + return groups + + wide_rows = [ + y + for y in range(h) + if sum(1 for x in range(w) if alpha.getpixel((x, y)) > _ALPHA_FLOOR) >= w * 0.85 + ] + tall_cols = [ + x + for x in range(w) + if sum(1 for y in range(h) if alpha.getpixel((x, y)) > _ALPHA_FLOOR) >= h * 0.85 + ] + + clear = Image.new("RGBA", rgba.size, (0, 0, 0, 0)) + for top, bottom in _thin_groups(wide_rows): + rgba.paste(clear.crop((0, top, w, bottom)), (0, top)) + for left, right in _thin_groups(tall_cols): + rgba.paste(clear.crop((left, 0, right, h)), (left, 0)) + return rgba + + +def _component_boxes(image) -> list[tuple[tuple[int, int, int, int], int]]: + """Connected opaque components as ``[(bbox, mass)]``. + + A full ML segmenter would be overkill here: after chroma keying, "the pet" is + the dominant connected alpha component inside each known slot. Tiny detached + sparkles, tears, UI dots, and neighbour slivers are separate components. 
+ """ + from collections import deque + + rgba = image.convert("RGBA") + bbox = rgba.getbbox() + if bbox is None: + return [] + l0, t0, r0, b0 = bbox + w, h = r0 - l0, b0 - t0 + alpha = rgba.getchannel("A").load() + visited = bytearray(w * h) + out: list[tuple[tuple[int, int, int, int], int]] = [] + + for start in range(w * h): + if visited[start]: + continue + sx, sy = start % w, start // w + ax, ay = l0 + sx, t0 + sy + visited[start] = 1 + if alpha[ax, ay] <= _ALPHA_FLOOR: + continue + + queue: deque[tuple[int, int]] = deque([(sx, sy)]) + left = right = sx + top = bottom = sy + mass = 0 + while queue: + x, y = queue.popleft() + mass += 1 + left, right = min(left, x), max(right, x) + top, bottom = min(top, y), max(bottom, y) + for nx, ny in ((x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)): + if 0 <= nx < w and 0 <= ny < h: + idx = ny * w + nx + if not visited[idx]: + visited[idx] = 1 + if alpha[l0 + nx, t0 + ny] > _ALPHA_FLOOR: + queue.append((nx, ny)) + out.append(((l0 + left, t0 + top, l0 + right + 1, t0 + bottom + 1), mass)) + return out + + +def _isolate_slot_subject(image): + """Keep the slot's real subject; drop detached effects/noise.""" + from PIL import Image + + rgba = _erase_long_axis_lines(image) + comps = _component_boxes(rgba) + if not comps: + return rgba + + main_box, main_mass = max(comps, key=lambda item: item[1]) + ml, mt, mr, mb = main_box + mw = max(1, mr - ml) + keep: list[tuple[int, int, int, int]] = [] + for box, mass in comps: + if box == main_box: + keep.append(box) + continue + left, _top, right, _bottom = box + overlap = max(0, min(right, mr) - max(left, ml)) + center_x = (left + right) / 2 + near_main = (ml - mw * 0.25) <= center_x <= (mr + mw * 0.25) + # Keep meaningful attached-looking accessories such as halos; drop + # sparkles/tears/noise that don't overlap the body column. 
+ if mass >= max(24, main_mass * 0.035) and (overlap >= mw * 0.3 or near_main): + keep.append(box) + + out = Image.new("RGBA", rgba.size, (0, 0, 0, 0)) + for box in keep: + out.alpha_composite(rgba.crop(box), (box[0], box[1])) + return out + + +def _has_slot_padding(image) -> bool: + """True when content has empty room on all four slot edges.""" + bbox = image.getbbox() + if bbox is None: + return False + w, h = image.size + left, top, right, bottom = bbox + min_x = max(4, min(12, round(w * 0.025))) + min_y = max(4, min(16, round(h * 0.02))) + return left >= min_x and top >= min_y and w - right >= min_x and h - bottom >= min_y + + +def _slot_bounds(width: int, frame_count: int) -> list[tuple[int, int]]: + return [ + (round(i * width / frame_count), round((i + 1) * width / frame_count)) + for i in range(frame_count) + ] + + +def _group_component_rows(boxes: list[tuple[int, int, int, int]]) -> list[list[tuple[int, int, int, int]]]: + """Group component boxes into visual rows, then sort left→right.""" + if not boxes: + return [] + heights = sorted(max(1, b[3] - b[1]) for b in boxes) + row_tol = max(12, heights[len(heights) // 2] * 0.55) + rows: list[list[tuple[int, int, int, int]]] = [] + centers: list[float] = [] + for box in sorted(boxes, key=lambda b: (b[1] + b[3]) / 2): + cy = (box[1] + box[3]) / 2 + for i, center in enumerate(centers): + if abs(cy - center) <= row_tol: + rows[i].append(box) + centers[i] = sum((b[1] + b[3]) / 2 for b in rows[i]) / len(rows[i]) + break + else: + rows.append([box]) + centers.append(cy) + ordered = [row for _center, row in sorted(zip(centers, rows, strict=False), key=lambda item: item[0])] + for row in ordered: + row.sort(key=lambda b: (b[0] + b[2]) / 2) + return ordered + + +def _merge_related_boxes(boxes: list[tuple[int, int, int, int]]) -> list[tuple[int, int, int, int]]: + """Merge disconnected parts that clearly belong to one subject. + + Capes, tails, horns, and held props sometimes key as separate components. 
+ Merge components on the same visual row when their vertical spans overlap and + the horizontal gap is tiny compared with the component size. Do not bridge the + much larger gaps between separate poses. + """ + boxes = list(boxes) + changed = True + while changed: + changed = False + merged: list[tuple[int, int, int, int]] = [] + used = [False] * len(boxes) + for i, a in enumerate(boxes): + if used[i]: + continue + al, at, ar, ab = a + used[i] = True + for j in range(i + 1, len(boxes)): + if used[j]: + continue + bl, bt, br, bb = boxes[j] + v_overlap = max(0, min(ab, bb) - max(at, bt)) + min_h = max(1, min(ab - at, bb - bt)) + gap = max(0, max(al, bl) - min(ar, br)) + min_w = max(1, min(ar - al, br - bl)) + if v_overlap >= min_h * 0.45 and gap <= max(14, min_w * 0.22): + al, at, ar, ab = min(al, bl), min(at, bt), max(ar, br), max(ab, bb) + used[j] = True + changed = True + merged.append((al, at, ar, ab)) + boxes = merged + return boxes + + +def _component_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None: + """Extract frame subjects as connected non-background objects. + + This is the robust path for models that ignore "one horizontal row" and emit a + 2D sprite grid. We count real opaque subject components, discard tiny + detached effects, sort in reading order, and return exactly *frame_count* + frames. Slot slicing is only a fallback when object detection can't satisfy + the contract. 
+ """ + from PIL import Image + + def attempt(source) -> list | None: + comps = _component_boxes(source) + if not comps: + return None + + max_mass = max(m for _box, m in comps) + subjects = _merge_related_boxes([box for box, mass in comps if mass >= max(64, max_mass * 0.12)]) + if len(subjects) < frame_count: + return None + + rows = _group_component_rows(subjects) + ordered = [box for row in rows for box in row][:frame_count] + if len(ordered) < frame_count: + return None + + if require_padding: + min_x = max(4, min(12, round(source.width * 0.01))) + min_y = max(4, min(16, round(source.height * 0.015))) + for left, top, right, bottom in ordered: + if left < min_x or top < min_y or source.width - right < min_x or source.height - bottom < min_y: + return None + + multirow = len(rows) > 1 + frames = [] + for left, top, right, bottom in ordered: + pad_x = max(8, round((right - left) * 0.08)) + pad_y = max(8, round((bottom - top) * 0.08)) + if multirow: + crop_box = ( + max(0, left - pad_x), + max(0, top - pad_y), + min(source.width, right + pad_x), + min(source.height, bottom + pad_y), + ) + elif frame_count == 1: + crop_box = (0, 0, source.width, source.height) + else: + # Preserve vertical motion for true one-row strips (jumping, + # bobbing) while still narrowing X around the object. + crop_box = (max(0, left - pad_x), 0, min(source.width, right + pad_x), source.height) + frame = Image.new("RGBA", (crop_box[2] - crop_box[0], crop_box[3] - crop_box[1]), (0, 0, 0, 0)) + rel = (left - crop_box[0], top - crop_box[1], right - crop_box[0], bottom - crop_box[1]) + frame.alpha_composite(source.crop((left, top, right, bottom)), (rel[0], rel[1])) + # The global component pass already chose the subject box. Do not run + # another component filter here: capes/tails can be legitimate + # disconnected lobes inside the chosen subject box. 
+ frames.append(frame) + return frames + + return attempt(strip) or attempt(_erase_long_axis_lines(strip)) + + def _sever_expected_gutters(strip, frame_count: int): """Cut thin vertical gutters at expected frame boundaries before labeling. @@ -397,17 +687,23 @@ def _sever_expected_gutters(strip, frame_count: int): return out -def _slot_crops(strip, frame_count: int) -> list: +def _slot_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None: """Slice *strip* into *frame_count* uniform columns (one coordinate space). Equal-width columns keep every frame in a single shared coordinate frame, so a later union-crop + shared placement (:func:`normalize_cells`) preserves the row's real motion without the per-frame re-centering that makes a pet visibly - slide. Neighbour side-bleed is trimmed per column. + slide. Each slot is cleaned independently so detached effects, floors, + dividers, and neighbour slivers do not become "frames". """ - w0 = max(1, strip.width // frame_count) h = strip.height - return [_drop_side_bleed(strip.crop((i * w0, 0, i * w0 + w0, h))) for i in range(frame_count)] + frames = [] + for left, right in _slot_bounds(strip.width, frame_count): + slot = _drop_side_bleed(_isolate_slot_subject(strip.crop((left, 0, right, h)))) + if require_padding and not _has_slot_padding(slot): + return None + frames.append(slot) + return frames def _content_runs(profile: list[int], *, threshold: int = 2) -> list[tuple[int, int]]: @@ -465,6 +761,52 @@ def _frame_x_ranges(strip, frame_count: int) -> list[tuple[int, int]] | None: return [(l, r) for l, r in groups] +def _significant_subject_boxes(image) -> list[tuple[int, int, int, int]]: + comps = _component_boxes(image) + if not comps: + return [] + max_mass = max(mass for _box, mass in comps) + return _merge_related_boxes([box for box, mass in comps if mass >= max(32, max_mass * 0.12)]) + + +def _validate_extracted_frames(frames: list, frame_count: int) -> None: + """Reject rows where one "frame" 
is really multiple poses. + + A bad provider roll can collapse a strip into tiny repeated poses. If we let + that through, normalization sees a huge motion envelope and shrinks the + entire pet to postage-stamp size. Catch the row here so hatch can regenerate + it instead of saving a technically non-empty but visually broken atlas. + """ + if len(frames) != frame_count: + raise ValueError(f"expected {frame_count} frames, got {len(frames)}") + + boxes = [] + for i, frame in enumerate(frames): + bbox = frame.getbbox() + if bbox is None: + raise ValueError(f"frame {i} is empty") + subjects = _significant_subject_boxes(frame) + if len(subjects) >= 3: + raise ValueError(f"frame {i} contains multiple separated subjects") + boxes.append(bbox) + + if frame_count <= 1: + return + + widths = sorted(b[2] - b[0] for b in boxes) + heights = sorted(b[3] - b[1] for b in boxes) + med_w = max(1, widths[len(widths) // 2]) + med_h = max(1, heights[len(heights) // 2]) + for i, (left, top, right, bottom) in enumerate(boxes): + width = right - left + height = bottom - top + # A legitimate wing/arm can be wider than the median pose. A frame that is + # several times wider while not proportionally taller is usually multiple + # mini-poses packed into one accepted frame. + if width > max(med_w * 3.0, med_w + 96) and height <= med_h * 1.6: + raise ValueError(f"frame {i} is a multi-pose width outlier") + + def extract_strip_frames( strip, frame_count: int, @@ -475,15 +817,15 @@ def extract_strip_frames( ) -> list: """Turn one generated row strip into *frame_count* frames. - The background is keyed out, thin connecting bridges at the expected - boundaries are severed, then the strip is sliced at its empty chroma gutters - (:func:`_frame_x_ranges`) — the plain "find each object, make a frame" cut - that works once poses are spaced apart (which generation now enforces). 
+ The background is keyed out, then strict extraction treats the requested + frame count as the source of truth: slice known equal slots, isolate the real + subject in each slot, and require empty padding on X and Y. Empty chroma + gutters are only a lenient salvage fallback. Each frame is cropped at full cell height so tall ears / halos are never - clipped; :func:`_drop_side_bleed` trims any faint neighbour sliver. When the - poses are touching (fewer gutters than frames) ``components`` raises and - ``auto`` falls back to equal-width slots. + clipped; detached effects and neighbour slivers are dropped per slot. When a + pose does not have required space around it, ``components`` raises and + ``auto`` falls back to best-effort slicing. *fit* (default) fits+centers each frame into a 192x208 cell — the standalone contract for callers that don't normalize. Hatching passes ``fit=False`` to @@ -500,28 +842,38 @@ def extract_strip_frames( strip = remove_background(strip, chroma_key=chroma_key) - # Prefer the real gutters as-is: when poses are already spaced (generation - # enforces this), slicing the strip untouched keeps each pose's own bounds and - # never cuts through an unevenly-placed silhouette. Only fall back to severing - # the expected boundaries when gaps alone can't separate the row — i.e. poses - # are bridged by a shared shadow/glow/1px line and read as one blob. - source = strip - ranges = _frame_x_ranges(source, frame_count) - if ranges is None: - source = _sever_expected_gutters(strip, frame_count) - ranges = _frame_x_ranges(source, frame_count) - - if ranges is None: + # Strict path: count actual non-background subjects first. This handles both + # the intended one-row strip and model-cheated 2D grids without ever stacking + # two visual rows into one frame. 
+ frames = _component_crops(strip, frame_count, require_padding=True) + if frames is None: + frames = _slot_crops(strip, frame_count, require_padding=True) + if frames is None: if method == "components": - raise ValueError(f"could not segment {frame_count} sprites from strip") - frames = _slot_crops(source, frame_count) - else: - h = source.height - pad = max(2, min(16, round((source.width / max(1, frame_count)) * 0.04))) - frames = [ - _drop_side_bleed(source.crop((max(0, left - pad), 0, min(source.width, right + pad), h))) - for left, right in ranges - ] + raise ValueError(f"could not segment {frame_count} padded sprites from strip") + + # Lenient salvage for the final attempt: prefer real gutters when they + # exist, then sever expected boundaries, then fall back to raw slots. Still + # try object extraction first, just without edge-padding enforcement, so + # cached/borderline model rolls can be inspected without stacking a 2D grid. + frames = _component_crops(strip, frame_count, require_padding=False) + if frames is None: + source = strip + ranges = _frame_x_ranges(source, frame_count) + if ranges is None: + source = _sever_expected_gutters(strip, frame_count) + ranges = _frame_x_ranges(source, frame_count) + + if ranges is None: + frames = _slot_crops(source, frame_count, require_padding=False) or [] + else: + h = source.height + pad = max(2, min(16, round((source.width / max(1, frame_count)) * 0.04))) + frames = [ + _drop_side_bleed(_isolate_slot_subject(source.crop((max(0, left - pad), 0, min(source.width, right + pad), h)))) + for left, right in ranges + ] + _validate_extracted_frames(frames, frame_count) return [_fit_to_cell(f) for f in frames] if fit else frames @@ -746,22 +1098,72 @@ def validate_atlas(atlas) -> dict: return {"ok": False, "width": atlas.width, "height": atlas.height, "errors": errors, "warnings": warnings, "filled_states": []} filled_states: list[str] = [] + cell_boxes_by_state: dict[str, list[tuple[int, int, int, int]]] = {} for state, 
row, count in ROW_SPECS: row_pixels = 0 + boxes: list[tuple[int, int, int, int]] = [] for col in range(count): left = col * CELL_WIDTH top = row * CELL_HEIGHT cell = atlas.crop((left, top, left + CELL_WIDTH, top + CELL_HEIGHT)) nonblank = sum(cell.getchannel("A").histogram()[1:]) row_pixels += nonblank + bbox = cell.getbbox() + if bbox is not None: + boxes.append(bbox) if row_pixels > 0: filled_states.append(state) + cell_boxes_by_state[state] = boxes else: warnings.append(f"state '{state}' has no frames") if not filled_states: errors.append("atlas is empty — no state produced any frames") + # A visually valid pet must occupy the cell. A single bad row can otherwise + # poison global normalization and shrink every state to a tiny postage stamp + # while still passing the old "non-empty cells" check. + all_widths = sorted( + right - left + for boxes in cell_boxes_by_state.values() + for left, _top, right, _bottom in boxes + ) + all_heights = sorted( + bottom - top + for boxes in cell_boxes_by_state.values() + for _left, top, _right, bottom in boxes + ) + global_med_w = 0 + global_med_h = 0 + if all_widths and all_heights: + global_med_w = all_widths[len(all_widths) // 2] + median_h = all_heights[len(all_heights) // 2] + global_med_h = median_h + min_h = max(56, round(CELL_HEIGHT * 0.28)) + if median_h < min_h: + errors.append(f"atlas sprites are too small after normalization (median frame height {median_h}px)") + + for state, boxes in cell_boxes_by_state.items(): + if len(boxes) <= 1: + continue + widths = sorted(right - left for left, _top, right, _bottom in boxes) + heights = sorted(bottom - top for _left, top, _right, bottom in boxes) + med_w = max(1, widths[len(widths) // 2]) + med_h = max(1, heights[len(heights) // 2]) + max_w = widths[-1] + max_h = heights[-1] + if max_w > max(med_w * 3.0, med_w + 96) and max_h <= med_h * 1.6: + errors.append(f"state '{state}' contains a multi-pose frame outlier") + # Per-state collapse guard: one malformed row (tiny slivers / 
chopped + # fragments) should not pass because other rows are healthy. + if global_med_w and global_med_h: + min_state_w = max(32, round(global_med_w * 0.42)) + min_state_h = max(40, round(global_med_h * 0.50)) + if med_w < min_state_w or med_h < min_state_h: + errors.append( + f"state '{state}' appears collapsed (median {med_w}x{med_h}px, global median {global_med_w}x{global_med_h}px)" + ) + # Transparent pixels must carry zero RGB (no halo residue). data = atlas.tobytes() residue = 0 diff --git a/agent/pet/generate/imagegen.py b/agent/pet/generate/imagegen.py index 00390d1ca8b..4f5000fd703 100644 --- a/agent/pet/generate/imagegen.py +++ b/agent/pet/generate/imagegen.py @@ -14,29 +14,40 @@ producing an ungrounded, drifting pet. from __future__ import annotations import logging +import os from dataclasses import dataclass from pathlib import Path logger = logging.getLogger(__name__) -# Providers that can ground generation on a reference image. -# openrouter / nous reach Gemini Flash Image (and friends) over the -# OpenRouter-compatible chat-completions image protocol, which accepts -# reference images for grounding. Nous Portal proxies OpenRouter, so both -# qualify. -_REF_CAPABLE = ("openai", "openai-codex", "krea", "openrouter", "nous") +# Providers that can ground generation on a reference image, in preference order +# (Nous Portal → OpenAI → OpenRouter → …). OpenRouter/Nous run a quality-first +# model chain and may fall back depending on account access and endpoint behavior, +# so fidelity can vary by configured backend + model availability. +_REF_CAPABLE = ("nous", "openai", "openai-codex", "openrouter", "krea") -# Friendly label + one-line speed/quality note per reference-capable provider, -# surfaced in the desktop pet-gen picker so users can trade speed for fidelity. 
-_PROVIDER_META: dict[str, dict[str, str]] = { - "nous": {"label": "Nous Portal", "note": "Fast, balanced quality"}, - "openrouter": {"label": "OpenRouter", "note": "Fastest — Gemini Flash Image"}, - "openai": {"label": "OpenAI", "note": "Highest fidelity, slower"}, - "openai-codex": {"label": "OpenAI (Codex)", "note": "Highest fidelity, slower"}, - "krea": {"label": "Krea", "note": "Stylized, style-reference grounding"}, +# Friendly display label per reference-capable provider, surfaced in the desktop +# pet-gen picker. +_PROVIDER_LABELS: dict[str, str] = { + "nous": "Nous Portal", + "openrouter": "OpenRouter", + "openai": "OpenAI", + "openai-codex": "OpenAI (Codex)", + "krea": "Krea", } +def _forced_provider_from_env() -> str | None: + """Optional QA override to force a pet-gen backend. + + `HERMES_PET_IMAGE_PROVIDER=` (e.g. `openrouter`) bypasses the normal + active/default provider resolution for pet generation only. Unknown values are + ignored so existing users are unaffected. + """ + forced = os.environ.get("HERMES_PET_IMAGE_PROVIDER", "").strip().lower() + return forced if forced in _REF_CAPABLE else None + + class GenerationError(RuntimeError): """Raised on any image-generation failure (no provider, API error, IO).""" @@ -71,6 +82,14 @@ def resolve_provider(*, require_references: bool = True, prefer: str | None = No _discover() from agent.image_gen_registry import get_active_provider, get_provider + # QA override: force one provider for pet-gen iteration regardless of the + # globally active image_gen backend. + forced = _forced_provider_from_env() + if forced: + chosen = get_provider(forced) + if chosen is not None and chosen.is_available(): + return SpriteProvider(name=forced, provider=chosen, supports_references=True) + # An explicit user pick wins when it's reference-capable and has credentials; # otherwise we ignore it and fall through to the normal resolution. 
if prefer: @@ -110,10 +129,11 @@ def resolve_provider(*, require_references: bool = True, prefer: str | None = No def list_sprite_providers() -> list[dict]: """The reference-capable providers available to pick for pet generation. - Returns ``[{name, label, note, default}]`` for every ref-capable provider the - user actually has credentials for, marking the one :func:`resolve_provider` - would choose with no explicit preference. Empty when none is configured (the - picker hides itself). Best-effort: discovery hiccups yield an empty list. + Returns ``[{name, label, default}]`` for every ref-capable provider the user + actually has credentials for, in preference order, marking the one + :func:`resolve_provider` would choose with no explicit preference. Empty when + none is configured (the picker hides itself). Best-effort: discovery hiccups + yield an empty list. """ _discover() from agent.image_gen_registry import get_provider @@ -128,12 +148,10 @@ def list_sprite_providers() -> list[dict]: provider = get_provider(name) if provider is None or not provider.is_available(): continue - meta = _PROVIDER_META.get(name, {}) out.append( { "name": name, - "label": meta.get("label", name), - "note": meta.get("note", ""), + "label": _PROVIDER_LABELS.get(name, name), "default": name == default_name, } ) diff --git a/agent/pet/generate/orchestrate.py b/agent/pet/generate/orchestrate.py index f160046ebf9..54a1adf5b07 100644 --- a/agent/pet/generate/orchestrate.py +++ b/agent/pet/generate/orchestrate.py @@ -38,7 +38,7 @@ _MAX_PARALLEL_GENERATIONS = 4 # How many times to (re)generate a single row before accepting a best-effort # slice. Early attempts demand clean per-pose gutters; the last is lenient so a # stubborn row still yields frames instead of dropping out entirely. 
-_ROW_GEN_ATTEMPTS = 2 +_ROW_GEN_ATTEMPTS = 3 _MIN_FILLED_STATES = 6 _REQUIRED_STATES = frozenset({"idle", "running-right", "waving"}) diff --git a/agent/pet/generate/prompts.py b/agent/pet/generate/prompts.py index 584cf99155d..085f8a05fc6 100644 --- a/agent/pet/generate/prompts.py +++ b/agent/pet/generate/prompts.py @@ -63,12 +63,14 @@ _STYLE_HINTS: dict[str, str] = { } _BACKGROUND = ( - "Center one full-body character on a flat, uniform, high-contrast chroma-key " - "background (prefer pure hot magenta #FF00FF unless that color appears on " - "the character). The background must completely surround the character: one " - "even color with NO gradient, vignette, texture, pattern, scenery, shadow, " - "ground line, frame, or border, so it keys out cleanly. The background color " - "must not appear anywhere on the character itself. No text, no labels." + "Center the character on a SINGLE flat, uniform, high-contrast chroma-key " + "background — pure hot magenta #FF00FF (only if magenta appears on the " + "character, use pure green #00FF00 instead). The background is ONE continuous " + "even color that completely surrounds the character with NO gradient, " + "vignette, texture, pattern, scenery, shadow, ground line, frame, border, " + "panel, comic cell, gutter line, grid, or divider of any kind, so it keys out " + "cleanly. The background color must not appear anywhere on the character. " + "No text, no labels, no speech bubbles, no UI." ) @@ -149,8 +151,12 @@ def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str | f"(same species, face, colors, markings, proportions, and props), " "preserving the same emotional tone/mood (e.g., scary stays scary, cute stays cute), " f"draw a single WIDE horizontal strip of {frame_count} animation frames showing {action}. " - f"LAYOUT: split the wide strip into {frame_count} equal vertical cells, one " - "pose centered in each cell. 
" + f"LAYOUT: arrange {frame_count} poses in ONE horizontal row at equal spacing, " + "each pose centered in its own imaginary equal region. Draw NO panel borders, " + "NO comic cells, NO boxes, NO vertical divider/gutter lines, NO grid, NO frame " + "outlines between poses — the backdrop is one unbroken flat field behind all of them. " + "Fill the WHOLE strip with the SAME single flat chroma-key color as the attached " + "reference image's background (identical hue in every frame, no per-pose color shifts). " f"SPACING (critical): draw each pose at a consistent, healthy, clearly " f"visible size (roughly {pose_px}px wide on a {_ASSUMED_STRIP_WIDTH}px " f"strip) — do NOT shrink it tiny — but keep its ENTIRE silhouette " @@ -166,8 +172,9 @@ def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str | # so only the action moves — this is what stops the loop sliding/pulsing. "REGISTRATION (critical): the character is the SAME height and SAME width " "in every frame, drawn at the SAME scale, centered over the SAME point, " - "with all feet resting on ONE shared horizontal ground line across the " - "whole strip. Keep the body's center, size, and stance fixed frame to " + "with all feet aligned to the SAME invisible horizontal baseline across the " + "whole strip — this baseline is conceptual ONLY: draw NO ground line, floor, " + "platform, horizon, or contact shadow beneath the feet. Keep the body's center, size, and stance fixed frame to " "frame — ONLY the limbs/features the action needs may move. Capes, cloaks, " "bags, and scarves stay in the SAME place and shape every frame (no " "swinging, flowing, or drifting) unless the action itself requires it. 
No " diff --git a/apps/desktop/src/app/pet-generate/components/provider-picker.tsx b/apps/desktop/src/app/pet-generate/components/provider-picker.tsx index bd40a30ba31..3279d7758aa 100644 --- a/apps/desktop/src/app/pet-generate/components/provider-picker.tsx +++ b/apps/desktop/src/app/pet-generate/components/provider-picker.tsx @@ -5,8 +5,9 @@ import { Check, ChevronDown } from '@/lib/icons' import { $petGenProvider, $petGenProviders, setPetGenProvider } from '@/store/pet-generate' // Image-backend picker for pet generation — the composer's model-pill pattern: -// a quiet trigger + a dropdown of options, each with a one-line speed/quality -// note. Hidden unless there are 2+ reference-capable backends (nothing to pick). +// a quiet trigger + a dropdown of options. No per-option notes: every backend +// resolves to the same faithful OpenAI image model, so there's no tradeoff to +// describe. Hidden unless there are 2+ reference-capable backends (nothing to pick). export function ProviderPicker() { const providers = useStore($petGenProviders) const picked = useStore($petGenProvider) @@ -32,19 +33,16 @@ export function ProviderPicker() { {/* The picker lives inside the pet-gen Dialog (z-130) and portals to body, so lift its menu above the dialog or it opens behind it. */} - + {providers.map(provider => ( setPetGenProvider(provider.default ? 
'' : provider.name)} > - - {provider.label} - {provider.name === current?.name && } - - {provider.note && {provider.note}} + {provider.label} + {provider.name === current?.name && } ))} diff --git a/apps/desktop/src/app/pet-generate/pet-generate-overlay.tsx b/apps/desktop/src/app/pet-generate/pet-generate-overlay.tsx index cd262e142c6..33bd3350f02 100644 --- a/apps/desktop/src/app/pet-generate/pet-generate-overlay.tsx +++ b/apps/desktop/src/app/pet-generate/pet-generate-overlay.tsx @@ -62,10 +62,11 @@ export function PetGenerateOverlay() { // The footer banner narrates the dialog's async state: the failure reason on a // dead-end error, else the "you can close this, we'll notify you" reassurance - // while a generate/hatch runs in the background. + // while a generate/hatch runs in the background. On step 1, show a neutral ETA. const working = status === 'generating' || status === 'hatching' const errored = status === 'error' && drafts.length === 0 - const banner = errored ? error || copy.genericError : working ? copy.backgroundHint : undefined + const stepOne = status === 'idle' || status === 'ready' + const banner = errored ? error || copy.genericError : working ? copy.backgroundHint : stepOne ? copy.slowProviderHint : undefined return ( diff --git a/apps/desktop/src/i18n/en.ts b/apps/desktop/src/i18n/en.ts index 195d6426e4f..cb46f199e4f 100644 --- a/apps/desktop/src/i18n/en.ts +++ b/apps/desktop/src/i18n/en.ts @@ -798,6 +798,7 @@ export const en: Translations = { namePlaceholder: 'Name your pet', staleBackend: 'Update Hermes to generate pets.', backgroundHint: 'You can close this — Hermes will notify you when it’s done.', + slowProviderHint: 'This can take up to 5 minutes', genericError: 'Generation failed — try again or pick a suggestion.', referenceImageTooLarge: 'Reference image is too large. Use one under 16 MB.', referenceImageInvalid: 'Could not read that reference image. 
Try a PNG, JPG, WebP, or GIF.', @@ -1845,7 +1846,8 @@ export const en: Translations = { restoreCheckpoint: 'Restore checkpoint', restoreFromHere: 'Restore checkpoint — rerun from this prompt', restoreTitle: 'Restore to this checkpoint?', - restoreBody: 'Everything after this prompt is removed from the conversation, and the prompt runs again from here.', + restoreBody: + 'Everything after this prompt is removed from the conversation, and the prompt runs again from here.', restoreConfirm: 'Restore & rerun', restoreNext: 'Restore next checkpoint', goForward: 'Go forward', @@ -1944,7 +1946,8 @@ export const en: Translations = { editFailed: 'Edit failed', resumeFailed: 'Resume failed', resumeStrandedTitle: "Couldn't load this session", - resumeStrandedBody: 'The connection to this session failed and automatic retries gave up. Check that the gateway is running, then try again.', + resumeStrandedBody: + 'The connection to this session failed and automatic retries gave up. Check that the gateway is running, then try again.', resumeRetry: 'Retry', nothingToBranch: 'Nothing to branch', branchNeedsChat: 'Start or resume a chat before branching.', diff --git a/apps/desktop/src/i18n/ja.ts b/apps/desktop/src/i18n/ja.ts index 70e93c65c10..1115972f420 100644 --- a/apps/desktop/src/i18n/ja.ts +++ b/apps/desktop/src/i18n/ja.ts @@ -917,6 +917,7 @@ export const ja = defineLocale({ namePlaceholder: 'ペットに名前を付ける', staleBackend: 'ペットを生成するには Hermes を更新してください。', backgroundHint: 'このウィンドウは閉じても大丈夫です。完了したら Hermes が通知します。', + slowProviderHint: 'これには最大5分かかることがあります。', genericError: '生成に失敗しました。もう一度試すか、候補を選んでください。', referenceImageTooLarge: '参照画像が大きすぎます。16 MB 未満の画像を使ってください。', referenceImageInvalid: '参照画像を読み込めませんでした。PNG/JPG/WebP/GIF を試してください。', diff --git a/apps/desktop/src/i18n/types.ts b/apps/desktop/src/i18n/types.ts index 1f0a4d4f2d6..800819669a8 100644 --- a/apps/desktop/src/i18n/types.ts +++ b/apps/desktop/src/i18n/types.ts @@ -672,6 +672,7 @@ export interface Translations { namePlaceholder: 
string staleBackend: string backgroundHint: string + slowProviderHint: string genericError: string referenceImageTooLarge: string referenceImageInvalid: string diff --git a/apps/desktop/src/i18n/zh-hant.ts b/apps/desktop/src/i18n/zh-hant.ts index f125c463662..ad93470171b 100644 --- a/apps/desktop/src/i18n/zh-hant.ts +++ b/apps/desktop/src/i18n/zh-hant.ts @@ -887,6 +887,7 @@ export const zhHant = defineLocale({ namePlaceholder: '為寵物命名', staleBackend: '請更新 Hermes 以生成寵物。', backgroundHint: '你可以關閉此視窗——完成後 Hermes 會通知你。', + slowProviderHint: '這可能最多需要 5 分鐘。', genericError: '生成失敗——請重試或選一個建議。', referenceImageTooLarge: '參考圖片過大。請使用小於 16 MB 的圖片。', referenceImageInvalid: '無法讀取該參考圖片。請嘗試 PNG、JPG、WebP 或 GIF。', diff --git a/apps/desktop/src/i18n/zh.ts b/apps/desktop/src/i18n/zh.ts index 91543562b12..8868ef5af2d 100644 --- a/apps/desktop/src/i18n/zh.ts +++ b/apps/desktop/src/i18n/zh.ts @@ -985,6 +985,7 @@ export const zh: Translations = { namePlaceholder: '给宠物起个名字', staleBackend: '请更新 Hermes 以生成宠物。', backgroundHint: '你可以关闭此窗口——完成后 Hermes 会通知你。', + slowProviderHint: '这可能最多需要 5 分钟。', genericError: '生成失败——请重试或选择一个建议。', referenceImageTooLarge: '参考图过大。请使用小于 16 MB 的图片。', referenceImageInvalid: '无法读取该参考图。请尝试 PNG、JPG、WebP 或 GIF。', diff --git a/apps/desktop/src/store/native-notifications.test.ts b/apps/desktop/src/store/native-notifications.test.ts index 48650df1217..de0bf876542 100644 --- a/apps/desktop/src/store/native-notifications.test.ts +++ b/apps/desktop/src/store/native-notifications.test.ts @@ -96,6 +96,19 @@ describe('dispatchNativeNotification focus gating', () => { dispatchNativeNotification({ kind: 'approval', sessionId: 'on-screen', title: 'approve' }) expect(notify).not.toHaveBeenCalled() }) + + it('fires a global completion notification while away with no active session (pet gen)', () => { + setActiveSessionId(null) + dispatchNativeNotification({ global: true, kind: 'backgroundDone', title: 'Your pet hatched' }) + expect(notify).toHaveBeenCalledTimes(1) + }) + + it('suppresses 
a global notification when the window is focused', () => { + setWindowState({ focused: true, hidden: false }) + setActiveSessionId(null) + dispatchNativeNotification({ global: true, kind: 'backgroundDone', title: 'Your pet hatched' }) + expect(notify).not.toHaveBeenCalled() + }) }) describe('dispatchNativeNotification preferences', () => { diff --git a/apps/desktop/src/store/native-notifications.ts b/apps/desktop/src/store/native-notifications.ts index 1c058c803e1..5e659d061b4 100644 --- a/apps/desktop/src/store/native-notifications.ts +++ b/apps/desktop/src/store/native-notifications.ts @@ -113,7 +113,15 @@ function isBackgrounded(): boolean { return typeof document.hasFocus === 'function' && !document.hasFocus() } -function shouldFire(kind: NativeNotificationKind, sessionId?: null | string): boolean { +function shouldFire(kind: NativeNotificationKind, sessionId?: null | string, global = false): boolean { + // Global notifications aren't tied to a chat session (e.g. pet generation, + // which runs from the command center with no active conversation). They fire + // whenever the user is away, with no session-match requirement — otherwise a + // background run started without an open session would be silently dropped. + if (global) { + return isBackgrounded() + } + // Attention kinds break through for an off-screen session even while focused. if (ATTENTION_KINDS.has(kind)) { return isBackgrounded() || (Boolean(sessionId) && sessionId !== $activeSessionId.get()) @@ -134,6 +142,12 @@ export interface NativeNotificationInput { title: string body?: string sessionId?: null | string + /** + * Not tied to a chat session (e.g. pet generation). Fires whenever the user + * is away, bypassing the session-match gate that completion kinds normally + * require. 
+ */ + global?: boolean silent?: boolean actions?: NativeNotificationAction[] } @@ -145,11 +159,11 @@ export function dispatchNativeNotification(input: NativeNotificationInput): void return } - if (!shouldFire(input.kind, input.sessionId)) { + if (!shouldFire(input.kind, input.sessionId, input.global)) { return } - if (throttled(`${input.kind}:${input.sessionId ?? ''}`, Date.now())) { + if (throttled(`${input.kind}:${input.sessionId ?? (input.global ? 'global' : '')}`, Date.now())) { return } diff --git a/apps/desktop/src/store/pet-generate.ts b/apps/desktop/src/store/pet-generate.ts index b47c858bd76..2b7962775aa 100644 --- a/apps/desktop/src/store/pet-generate.ts +++ b/apps/desktop/src/store/pet-generate.ts @@ -6,8 +6,6 @@ import { dispatchNativeNotification } from '@/store/native-notifications' import { notify } from '@/store/notifications' import { type PetInfo } from '@/store/pet' import { applyAdoptedPet, type GatewayRequest } from '@/store/pet-gallery' -import { $activeSessionId } from '@/store/session' - /** * Feature store for the "generate a pet" flow (Cmd-K → Pets → Generate). * @@ -111,8 +109,6 @@ export const $petGenAvailable = atom(null) export interface PetGenProvider { name: string label: string - /** One-line speed/quality tradeoff note. */ - note: string /** Whether this is the backend's default pick (no override needed). */ default: boolean } @@ -227,7 +223,10 @@ function notifyPetGenDone(title: string, message: string, kind: 'error' | 'succe } notify({ kind, title, message, action: { label: 'View', onClick: openPetGenerate } }) - dispatchNativeNotification({ kind: 'backgroundDone', title, body: message, sessionId: $activeSessionId.get() }) + // Pet generation isn't tied to a chat session — mark it global so the OS + // notification fires whenever the user is away, even with no active session + // (the common case: generating from the command center with no conversation). 
+ dispatchNativeNotification({ kind: 'backgroundDone', title, body: message, global: true }) } interface GenerateOptions { diff --git a/plugins/image_gen/openrouter/__init__.py b/plugins/image_gen/openrouter/__init__.py index 53c0a8b73da..5b2b105d040 100644 --- a/plugins/image_gen/openrouter/__init__.py +++ b/plugins/image_gen/openrouter/__init__.py @@ -3,7 +3,7 @@ Both OpenRouter and the Nous Portal inference endpoint speak the same OpenAI-style ``/chat/completions`` image-generation protocol: send ``modalities: ["image", "text"]`` with an image-output model (e.g. -``google/gemini-2.5-flash-image``), pass reference images as ``image_url`` +``google/gemini-3-pro-image``), pass reference images as ``image_url`` content parts for grounding, and read the generated images back from ``choices[0].message.images[].image_url.url`` (a ``data:image/...;base64`` URI). @@ -40,10 +40,17 @@ from agent.image_gen_provider import ( logger = logging.getLogger(__name__) -# Default image-output model. Gemini 2.5 Flash Image ("nano-banana") is GA on -# OpenRouter, accepts reference images for grounding, and honors -# ``image_config.aspect_ratio``. -DEFAULT_MODEL = "google/gemini-2.5-flash-image" +# Quality-first model chain for OpenRouter-compatible endpoints. +# +# Default behavior (no env/config override): try the highest-fidelity OpenAI +# image model first, then fall back to Gemini 3 Pro Image if the OpenAI model +# is access-gated / unavailable / times out on this endpoint. +# +# Explicit override (OPENROUTER_IMAGE_MODEL or image_gen.<provider>.model): +# use exactly that model (no auto fallback), so power users keep full control. +DEFAULT_MODEL = "openai/gpt-5.4-image-2" +_FALLBACK_MODEL = "google/gemini-3-pro-image" +_DEFAULT_MODEL_CHAIN = (DEFAULT_MODEL, _FALLBACK_MODEL) # Semantic aspect ratio (the image_gen contract) → OpenRouter's image_config # aspect_ratio strings. 
@@ -121,6 +128,43 @@ def _extract_images(payload: Dict[str, Any]) -> List[str]: return out +def _access_error_hint( + display: str, model_id: str, env_var: str, status: int, err_msg: str +) -> Optional[str]: + """A targeted hint when an access-gated OpenAI image model can't be reached. + + Some OpenAI image models on OpenRouter need account enablement / BYOK, so the + failure isn't a missing key (the key is valid) — the *model* is unreachable. + The generic "check your key" message is misleading there, so we detect that + case and point the user at the real fix. Returns one actionable line, or + ``None`` when this isn't the access-gated case. + """ + if not model_id.startswith("openai/"): + return None + low = (err_msg or "").lower() + gated = status in (402, 403, 404) or any( + s in low for s in ("no endpoints", "no allowed", "not a valid model", "data policy") + ) + if not gated: + return None + return ( + f"{display} can't reach image model '{model_id}' ({status}) — enable OpenAI " + f"image access in your {display} account, or set {env_var}={_FALLBACK_MODEL}." + ) + + +def _dedupe_models(models: list[str]) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for model in models: + m = (model or "").strip() + if not m or m in seen: + continue + seen.add(m) + out.append(m) + return out + + class OpenRouterCompatImageProvider(ImageGenProvider): """Image generation over an OpenRouter-compatible chat-completions endpoint. 
@@ -180,9 +224,14 @@ class OpenRouterCompatImageProvider(ImageGenProvider): return [ { "id": DEFAULT_MODEL, - "display": "Gemini 2.5 Flash Image (nano-banana)", - "strengths": "Reference-grounded edits; aspect-ratio control", - } + "display": "OpenAI GPT-5.4 Image 2", + "strengths": "Highest fidelity; best prompt adherence; slower on OpenRouter", + }, + { + "id": _FALLBACK_MODEL, + "display": "Gemini 3 Pro Image", + "strengths": "Fast, reliable fallback with good layout adherence", + }, ] def default_model(self) -> Optional[str]: @@ -193,16 +242,24 @@ class OpenRouterCompatImageProvider(ImageGenProvider): def _resolve_model(self) -> str: """Pick the image model: env override → config → :data:`DEFAULT_MODEL`.""" + return self._resolve_model_chain()[0] + + def _resolve_model_chain(self) -> list[str]: + """Ordered model attempts for this request. + + Explicit user/model config means "use this exact model", so no fallback. + Without overrides we run the quality-first default chain. + """ env_override = os.environ.get(self._model_env_var, "").strip() if env_override: - return env_override + return [env_override] cfg = _load_image_gen_config() scoped = cfg.get(self._config_key) if isinstance(cfg.get(self._config_key), dict) else {} if isinstance(scoped, dict): value = scoped.get("model") if isinstance(value, str) and value.strip(): - return value.strip() - return DEFAULT_MODEL + return [value.strip()] + return _dedupe_models(list(_DEFAULT_MODEL_CHAIN)) def generate( self, @@ -237,7 +294,7 @@ class OpenRouterCompatImageProvider(ImageGenProvider): aspect_ratio=aspect_ratio, ) - model_id = self._resolve_model() + model_chain = self._resolve_model_chain() aspect = resolve_aspect_ratio(aspect_ratio) or_aspect = _ASPECT_RATIOS.get(aspect, "1:1") @@ -258,12 +315,6 @@ class OpenRouterCompatImageProvider(ImageGenProvider): if part: content.append({"type": "image_url", "image_url": {"url": part}}) - payload: Dict[str, Any] = { - "model": model_id, - "modalities": ["image", 
"text"], - "messages": [{"role": "user", "content": content}], - "image_config": {"aspect_ratio": or_aspect}, - } headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", @@ -271,102 +322,145 @@ class OpenRouterCompatImageProvider(ImageGenProvider): "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", "X-Title": "Hermes Agent", } - - try: - response = requests.post( - f"{base_url}/chat/completions", - headers=headers, - json=payload, - timeout=_REQUEST_TIMEOUT, - ) - response.raise_for_status() - except requests.HTTPError as exc: - resp = exc.response - status = resp.status_code if resp is not None else 0 + last_error: Optional[Dict[str, Any]] = None + for i, model_id in enumerate(model_chain): + payload: Dict[str, Any] = { + "model": model_id, + "modalities": ["image", "text"], + "messages": [{"role": "user", "content": content}], + "image_config": {"aspect_ratio": or_aspect}, + } + is_last = i == len(model_chain) - 1 try: - err_msg = resp.json().get("error", {}).get("message", resp.text[:300]) - except Exception: # noqa: BLE001 - err_msg = resp.text[:300] if resp is not None else str(exc) - logger.error("%s image gen failed (%d): %s", self._name, status, err_msg) - return error_response( - error=f"{self._display} image generation failed ({status}): {err_msg}", - error_type="api_error", - provider=self._name, + response = requests.post( + f"{base_url}/chat/completions", + headers=headers, + json=payload, + timeout=_REQUEST_TIMEOUT, + ) + response.raise_for_status() + except requests.HTTPError as exc: + resp = exc.response + status = resp.status_code if resp is not None else 0 + try: + err_msg = resp.json().get("error", {}).get("message", resp.text[:300]) + except Exception: # noqa: BLE001 + err_msg = resp.text[:300] if resp is not None else str(exc) + logger.error("%s image gen failed (%d) on %s: %s", self._name, status, model_id, err_msg) + hint = _access_error_hint(self._display, model_id, self._model_env_var, status, 
err_msg) + if hint and not is_last: + logger.info( + "%s model %s unavailable; retrying with fallback %s", + self._name, + model_id, + model_chain[i + 1], + ) + continue + last_error = error_response( + error=hint or f"{self._display} image generation failed ({status}): {err_msg}", + error_type="model_access" if hint else "api_error", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + return last_error + except requests.Timeout: + if not is_last: + logger.info( + "%s model %s timed out; retrying with fallback %s", + self._name, + model_id, + model_chain[i + 1], + ) + continue + return error_response( + error=f"{self._display} image generation timed out " + f"({int(_REQUEST_TIMEOUT)}s)", + error_type="timeout", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + except requests.ConnectionError as exc: + return error_response( + error=f"{self._display} connection error: {exc}", + error_type="connection_error", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + + try: + result = response.json() + except Exception as exc: # noqa: BLE001 + return error_response( + error=f"{self._display} returned invalid JSON: {exc}", + error_type="invalid_response", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + + images = _extract_images(result) + if not images: + if not is_last: + logger.info( + "%s model %s returned no image; retrying with fallback %s", + self._name, + model_id, + model_chain[i + 1], + ) + continue + # A response with text but no image usually means the model didn't + # honor image output (wrong model or modalities); surface that. + return error_response( + error=( + f"{self._display} returned no image. Ensure the model " + f"'{model_id}' supports image output." 
+ ), + error_type="empty_response", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + + first = images[0] + try: + if first.startswith("data:"): + b64 = first.split(",", 1)[1] if "," in first else "" + saved_path = save_b64_image(b64, prefix=f"{self._name}_gen") + else: + saved_path = save_url_image(first, prefix=f"{self._name}_gen") + except Exception as exc: # noqa: BLE001 + return error_response( + error=f"Could not save generated image: {exc}", + error_type="io_error", + provider=self._name, + model=model_id, + prompt=prompt, + aspect_ratio=aspect, + ) + + return success_response( + image=str(saved_path), model=model_id, prompt=prompt, aspect_ratio=aspect, - ) - except requests.Timeout: - return error_response( - error=f"{self._display} image generation timed out " - f"({int(_REQUEST_TIMEOUT)}s)", - error_type="timeout", provider=self._name, - model=model_id, - prompt=prompt, - aspect_ratio=aspect, - ) - except requests.ConnectionError as exc: - return error_response( - error=f"{self._display} connection error: {exc}", - error_type="connection_error", - provider=self._name, - model=model_id, - prompt=prompt, - aspect_ratio=aspect, ) - try: - result = response.json() - except Exception as exc: # noqa: BLE001 - return error_response( - error=f"{self._display} returned invalid JSON: {exc}", - error_type="invalid_response", - provider=self._name, - model=model_id, - prompt=prompt, - aspect_ratio=aspect, - ) - - images = _extract_images(result) - if not images: - # A response with text but no image usually means the model didn't - # honor image output (wrong model or modalities); surface that. - return error_response( - error=( - f"{self._display} returned no image. Ensure the model " - f"'{model_id}' supports image output." 
- ), - error_type="empty_response", - provider=self._name, - model=model_id, - prompt=prompt, - aspect_ratio=aspect, - ) - - first = images[0] - try: - if first.startswith("data:"): - b64 = first.split(",", 1)[1] if "," in first else "" - saved_path = save_b64_image(b64, prefix=f"{self._name}_gen") - else: - saved_path = save_url_image(first, prefix=f"{self._name}_gen") - except Exception as exc: # noqa: BLE001 - return error_response( - error=f"Could not save generated image: {exc}", - error_type="io_error", - provider=self._name, - model=model_id, - prompt=prompt, - aspect_ratio=aspect, - ) - - return success_response( - image=str(saved_path), - model=model_id, + return last_error or error_response( + error=f"{self._display} image generation failed after trying all candidate models.", + error_type="api_error", + provider=self._name, + model=model_chain[-1] if model_chain else "", prompt=prompt, aspect_ratio=aspect, - provider=self._name, ) diff --git a/tests/agent/test_pet_generate.py b/tests/agent/test_pet_generate.py index 82bd3f15de5..800f8fa36e8 100644 --- a/tests/agent/test_pet_generate.py +++ b/tests/agent/test_pet_generate.py @@ -52,6 +52,21 @@ def test_extract_strip_frames_keys_out_solid_background(): assert frames[0].getpixel((0, 0))[3] == 0 +def test_remove_background_defringes_antialiased_edge(): + # The contaminated antialiased ring where sprite meets backdrop survives the + # key (it's a blend, too far from pure magenta). Defringe shaves that 1px ring: + # the keyed silhouette comes back eroded ~1px on every side, core intact. 
+ img = Image.new("RGBA", (200, 200), (255, 0, 255, 255)) + draw = ImageDraw.Draw(img) + draw.rectangle((50, 50, 149, 149), fill=(40, 200, 60, 255)) # 100x100 green + keyed = atlas.remove_background(img) + bbox = keyed.getbbox() + assert bbox is not None + w, h = bbox[2] - bbox[0], bbox[3] - bbox[1] + assert 96 <= w <= 99 and 96 <= h <= 99 # ~1px shaved per side + assert keyed.getpixel((100, 100))[3] > 0 # core intact + + def test_remove_background_clears_trapped_chroma_pocket(): # Green body enclosing a magenta pocket (the "pink between the arm" case): # the pocket isn't border-reachable, so it must be cleared by interior seeding. @@ -106,6 +121,47 @@ def test_extract_strip_frames_drops_small_side_lobes_from_adjacent_frames(): assert right_edge_mass == 0 +def test_extract_strip_frames_drops_detached_slot_effects(): + img = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + draw = ImageDraw.Draw(img) + draw.ellipse((72, 54, 148, 172), fill=(70, 190, 70, 255)) # subject + draw.polygon([(10, 76), (16, 84), (24, 78), (18, 88)], fill=(255, 255, 160, 255)) # sparkle + + frame = atlas.extract_strip_frames(img, 1, method="components", fit=False)[0] + bbox = frame.getbbox() + assert bbox is not None + assert bbox[0] > 40 # detached sparkle was removed + + +def test_extract_strip_frames_requires_slot_padding_in_strict_mode(): + img = Image.new("RGBA", (atlas.CELL_WIDTH * 2, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + draw = ImageDraw.Draw(img) + # Frame 0 touches the top edge; strict mode should reject the row so the + # caller regenerates instead of accepting a clipped pet frame. 
+ draw.rectangle((40, 0, 120, 130), fill=(70, 190, 70, 255)) + draw.rectangle((atlas.CELL_WIDTH + 40, 40, atlas.CELL_WIDTH + 120, 170), fill=(70, 190, 70, 255)) + + with pytest.raises(ValueError): + atlas.extract_strip_frames(img, 2, method="components", fit=False) + + +def test_extract_strip_frames_rejects_multi_pose_frame_outlier(): + frames = [] + for _ in range(3): + frame = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + ImageDraw.Draw(frame).rectangle((82, 120, 108, 178), fill=(220, 240, 255, 255)) + frames.append(frame) + + bad = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + draw = ImageDraw.Draw(bad) + for x in (10, 50, 90, 130, 166): + draw.rectangle((x, 124, x + 12, 172), fill=(220, 240, 255, 255)) + frames.append(bad) + + with pytest.raises(ValueError, match="multiple separated subjects"): + atlas._validate_extracted_frames(frames, 4) + + def test_extract_strip_frames_uses_real_gutters_when_spacing_is_uneven(): # gpt-image often returns a square chroma strip whose poses are separated but # not laid out on exact equal-width slots. 
Equal slot slicing would include @@ -183,6 +239,35 @@ def test_validate_atlas_rejects_rgb_residue(): assert any("residue" in e for e in result["errors"]) +def test_validate_atlas_rejects_postage_stamp_sprite(): + sheet = Image.new("RGBA", (atlas.ATLAS_WIDTH, atlas.ATLAS_HEIGHT), (0, 0, 0, 0)) + frame = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + ImageDraw.Draw(frame).rectangle((86, 174, 106, 201), fill=(220, 240, 255, 255)) + + for _state, row, count in atlas.ROW_SPECS: + for col in range(count): + sheet.alpha_composite(frame, (col * atlas.CELL_WIDTH, row * atlas.CELL_HEIGHT)) + + result = atlas.validate_atlas(sheet) + + assert not result["ok"] + assert any("too small" in e for e in result["errors"]) + + +def test_validate_atlas_rejects_one_collapsed_state_row(): + frames = _frames_for_all_states() + tiny = Image.new("RGBA", (atlas.CELL_WIDTH, atlas.CELL_HEIGHT), (0, 0, 0, 0)) + draw = ImageDraw.Draw(tiny) + draw.rectangle((90, 150, 106, 199), fill=(220, 240, 255, 255)) + frames["failed"] = [tiny.copy() for _ in range(atlas.FRAME_COUNTS["failed"])] + + sheet = atlas.compose_atlas(frames) + result = atlas.validate_atlas(sheet) + + assert not result["ok"] + assert any("appears collapsed" in e and "failed" in e for e in result["errors"]) + + def test_validate_atlas_warns_on_empty_state(): frames = _frames_for_all_states() frames["jumping"] = [] @@ -463,9 +548,12 @@ def test_list_sprite_providers_marks_default(monkeypatch): listed = imagegen.list_sprite_providers() names = {p["name"] for p in listed} assert names == {"openai", "nous"} - # Every entry carries display metadata, and exactly one is the default. - assert all(p["label"] and "note" in p for p in listed) + # Every entry carries a display label (no quality note — all backends are equal). 
+ assert all(p["label"] for p in listed) + assert all("note" not in p for p in listed) assert [p["name"] for p in listed if p["default"]] == ["openai"] + # Listed in preference order: Nous Portal before OpenAI. + assert [p["name"] for p in listed] == ["nous", "openai"] def test_generate_retries_without_transparent_background(monkeypatch, tmp_path): diff --git a/tests/plugins/image_gen/test_openrouter_compat_provider.py b/tests/plugins/image_gen/test_openrouter_compat_provider.py index 77724c82528..654f70078d0 100644 --- a/tests/plugins/image_gen/test_openrouter_compat_provider.py +++ b/tests/plugins/image_gen/test_openrouter_compat_provider.py @@ -99,11 +99,22 @@ class TestProviderClass: with patch("plugins.image_gen.openrouter._load_image_gen_config", return_value={}): assert _openrouter().default_model() == DEFAULT_MODEL - assert DEFAULT_MODEL == "google/gemini-2.5-flash-image" + # Default must be an image-output model id (provider/model form). + assert "/" in DEFAULT_MODEL and "image" in DEFAULT_MODEL + + def test_default_chain_prefers_quality_then_fallback(self): + from plugins.image_gen.openrouter import _FALLBACK_MODEL, _DEFAULT_MODEL_CHAIN + + with patch("plugins.image_gen.openrouter._load_image_gen_config", return_value={}): + chain = _openrouter()._resolve_model_chain() + assert chain == list(_DEFAULT_MODEL_CHAIN) + assert chain[0].startswith("openai/") + assert chain[-1] == _FALLBACK_MODEL def test_model_env_override(self, monkeypatch): monkeypatch.setenv("OPENROUTER_IMAGE_MODEL", "black-forest-labs/flux.2-pro") assert _openrouter()._resolve_model() == "black-forest-labs/flux.2-pro" + assert _openrouter()._resolve_model_chain() == ["black-forest-labs/flux.2-pro"] def test_model_config_override(self): cfg = {"openrouter": {"model": "google/gemini-3.1-flash-image-preview"}} @@ -153,6 +164,30 @@ class TestHelpers: assert _extract_images({"choices": [{"message": {"content": "no image"}}]}) == [] + def test_access_error_hint_for_gated_openai_model(self): + 
from plugins.image_gen.openrouter import _FALLBACK_MODEL, _access_error_hint + + hint = _access_error_hint( + "OpenRouter", "openai/gpt-5.4-image-2", "OPENROUTER_IMAGE_MODEL", 404, "No endpoints found" + ) + assert hint is not None + assert "openai/gpt-5.4-image-2" in hint + assert "OPENROUTER_IMAGE_MODEL" in hint + assert _FALLBACK_MODEL in hint + # Stays a single line under the humanizer's 200-char truncation. + assert "\n" not in hint and len(hint) <= 200 + + def test_access_error_hint_ignores_non_openai_models(self): + from plugins.image_gen.openrouter import _access_error_hint + + assert _access_error_hint("OpenRouter", "google/gemini-3-pro-image", "X", 404, "boom") is None + + def test_access_error_hint_ignores_unrelated_errors(self): + from plugins.image_gen.openrouter import _access_error_hint + + # A 5xx-class transient with an openai model but no access signal → no hint. + assert _access_error_hint("OpenRouter", "openai/gpt-5.4-image-2", "X", 500, "server error") is None + # --------------------------------------------------------------------------- # generate() @@ -260,10 +295,11 @@ class TestGenerate: resp.raise_for_status.side_effect = req_lib.HTTPError(response=resp) with patch(_RUNTIME, return_value=_runtime_ok()), \ - patch("requests.post", return_value=resp): + patch("requests.post", return_value=resp) as mock_post: result = _openrouter().generate(prompt="a pet") assert result["success"] is False assert result["error_type"] == "api_error" + assert mock_post.call_count == 1 def test_timeout(self): import requests as req_lib @@ -274,6 +310,55 @@ class TestGenerate: assert result["success"] is False assert result["error_type"] == "timeout" + def test_access_gated_model_surfaces_hint(self, monkeypatch): + """A 404 on an OpenAI image model yields the actionable access hint (not + the misleading generic 'check your key' message).""" + import requests as req_lib + + monkeypatch.setenv("OPENROUTER_IMAGE_MODEL", "openai/gpt-5.4-image-2") + resp = 
MagicMock() + resp.status_code = 404 + resp.text = "No endpoints found for openai/gpt-5.4-image-2" + resp.json.return_value = {"error": {"message": "No endpoints found"}} + resp.raise_for_status.side_effect = req_lib.HTTPError(response=resp) + + with patch(_RUNTIME, return_value=_runtime_ok()), \ + patch("requests.post", return_value=resp) as mock_post: + result = _openrouter().generate(prompt="a pet") + + assert result["success"] is False + assert result["error_type"] == "model_access" + assert "OpenAI image access" in result["error"] + assert mock_post.call_count == 1 # explicit override: no auto-fallback chain + + def test_access_gated_default_model_falls_back_to_gemini(self): + import requests as req_lib + + from plugins.image_gen.openrouter import DEFAULT_MODEL, _FALLBACK_MODEL + + gated = MagicMock() + gated.status_code = 404 + gated.text = f"No endpoints found for {DEFAULT_MODEL}" + gated.json.return_value = {"error": {"message": "No endpoints found"}} + gated.raise_for_status.side_effect = req_lib.HTTPError(response=gated) + + with patch(_RUNTIME, return_value=_runtime_ok()), \ + patch("requests.post", side_effect=[gated, _mock_chat_response([_PNG_DATA_URI])]) as mock_post, \ + patch( + "plugins.image_gen.openrouter.save_b64_image", + return_value=Path("/tmp/openrouter_gen_fallback.png"), + ): + result = _openrouter().generate(prompt="a pet") + + assert result["success"] is True + assert result["model"] == _FALLBACK_MODEL + assert result["image"] == "/tmp/openrouter_gen_fallback.png" + assert mock_post.call_count == 2 + first_model = mock_post.call_args_list[0].kwargs["json"]["model"] + second_model = mock_post.call_args_list[1].kwargs["json"]["model"] + assert first_model == DEFAULT_MODEL + assert second_model == _FALLBACK_MODEL + # --------------------------------------------------------------------------- # Registration + pet integration