hermes-agent/agent/pet/generate/atlas.py
Brooklyn Nicholson e92b5c6af8 feat(pets): quality-first OpenRouter model chain + stronger atlas gates + global pet-gen notifications
OpenRouter/Nous image gen now runs a quality-first model chain by default:
attempt the highest-fidelity OpenAI image model first, then fall back to
Gemini 3 Pro Image when it's access-gated/unavailable/times out. An explicit
OPENROUTER_IMAGE_MODEL / config model override pins one model with no fallback.

Atlas validation rejects malformed model output instead of shipping it: adds a
per-state collapse guard (a single sliver/fragment row no longer passes because
other rows are healthy), on top of the existing postage-stamp + multi-pose
checks.

Desktop: pet-gen native notifications are now "global" (not tied to a chat
session), so a background generation started from the command center fires an
OS notification when the user is away even with no active session. Adds a
neutral "This can take up to 5 minutes." banner on step 1, and lets the
provider picker auto-size.

Tests updated/added for the OpenRouter fallback chain, the collapse guard, and
the global notification path.
2026-06-24 23:11:21 -05:00

1183 lines
47 KiB
Python

"""Deterministic spritesheet assembly — generated row strips → Hermes atlas.
Image-generation models are good at *drawing* a row of poses but bad at exact
grid geometry, so the model never owns the atlas layout: it produces one loose
horizontal strip per state, and these deterministic ops slice that strip into
clean, centered, transparent ``192x208`` cells and pack them into the sheet our
renderer reads.
The atlas follows the **petdex/Codex standard**: 8 columns x 9 rows of
``192x208`` cells (``1536x1872``), with the row order + per-row frame counts
from OpenAI's ``hatch-pet`` skill. Our renderer (:mod:`agent.pet.render`) keys
frames as ``rows = states, cols = frames`` via
:data:`agent.pet.constants.CODEX_STATE_ROWS`, and a pet built here is a valid
``petdex submit`` spritesheet. Rows shorter than 8 columns leave the trailing
cells fully transparent.
Note ``running`` is the *working* state (in-place processing), NOT locomotion —
``running-right`` / ``running-left`` are the actual directional walk cycles.
The frame-segmentation, fit-to-cell, and transparency-residue logic is adapted
from OpenAI's ``hatch-pet`` skill (openai/skills, Apache-2.0).
"""
from __future__ import annotations
import io
import logging
import math
from pathlib import Path
from agent.pet.constants import FRAME_H, FRAME_W
logger = logging.getLogger(__name__)
# Local aliases for the cell geometry imported from ``agent.pet.constants``.
CELL_WIDTH = FRAME_W
CELL_HEIGHT = FRAME_H
# (state, row index, frame count). Order/row indices MUST match
# ``constants.CODEX_STATE_ROWS`` so the renderer crops the right row for each
# driven state, and the per-row frame counts mirror the petdex/Codex
# ``hatch-pet`` ``animation-rows`` spec. The renderer trims trailing blank
# columns, so rows shorter than ``COLUMNS`` (8) just leave the tail transparent.
ROW_SPECS: list[tuple[str, int, int]] = [
    ("idle", 0, 6),
    ("running-right", 1, 8),
    ("running-left", 2, 8),
    ("waving", 3, 4),
    ("jumping", 4, 5),
    ("failed", 5, 8),
    ("waiting", 6, 6),
    ("running", 7, 6),
    ("review", 8, 6),
]
# Grid dimensions are derived from ROW_SPECS: one atlas row per state, and as
# many columns as the longest row has frames.
ROWS = len(ROW_SPECS)
COLUMNS = max(count for _, _, count in ROW_SPECS)
# Full sheet size in pixels.
ATLAS_WIDTH = COLUMNS * CELL_WIDTH
ATLAS_HEIGHT = ROWS * CELL_HEIGHT
# Per-state frame-count lookup keyed by state name.
FRAME_COUNTS: dict[str, int] = {state: count for state, _, count in ROW_SPECS}
# Alpha at/below which a pixel is "background" for component detection.
_ALPHA_FLOOR = 16
# Cell padding kept around a fitted sprite so poses never touch the edge.
# ``_fit_to_cell`` subtracts this once from each cell dimension and then centers
# the sprite, so the margin per side is roughly half this value.
_CELL_PAD = 10
# Margin for the normalized pass — small, to fill the cell like real petdex pets
# (they sit ~5px from the edges); the width clamp, not the pad, prevents clipping.
_NORMALIZE_PAD = 14
# Side-lobe cutoff for fitted frames. Adjacent-pose bleed usually appears as a
# small separated horizontal lobe beside the real subject; keep sizeable lobes so
# we don't punish a legitimate wide pose. A lobe is kept when its column mass is
# at least this fraction of the heaviest lobe's mass.
_SIDE_LOBE_RATIO = 0.18
# ───────────────────────── background removal ─────────────────────────
def _color_distance(r: int, g: int, b: int, key: tuple[int, int, int]) -> float:
    """Return the Euclidean RGB distance between ``(r, g, b)`` and *key*."""
    delta_r = r - key[0]
    delta_g = g - key[1]
    delta_b = b - key[2]
    return math.sqrt(delta_r * delta_r + delta_g * delta_g + delta_b * delta_b)
def _has_transparency(image) -> bool:
    """Report whether *image* already has a genuinely transparent background.

    Requires both that the lowest alpha value is at/below ``_ALPHA_FLOOR`` and
    that more than 5% of all pixels fall at/below that floor, so a handful of
    stray transparent pixels does not count.
    """
    alpha = image.getchannel("A")
    lowest, _highest = alpha.getextrema()
    if lowest > _ALPHA_FLOOR:
        return False
    # Histogram bins 0.._ALPHA_FLOOR are the pixels we treat as background.
    see_through = sum(alpha.histogram()[: _ALPHA_FLOOR + 1])
    pixel_count = image.width * image.height
    return see_through > pixel_count * 0.05
def _dominant_corner_color(image) -> tuple[int, int, int]:
    """Pick the backdrop color by voting across the four corner pixels.

    Only corners whose alpha exceeds ``_ALPHA_FLOOR`` vote. When every corner is
    transparent the fallback is pure green ``(0, 255, 0)``.
    """
    from collections import Counter

    last_x = image.width - 1
    last_y = image.height - 1
    pixels = image.load()
    votes: Counter = Counter()
    for corner in ((0, 0), (last_x, 0), (0, last_y), (last_x, last_y)):
        red, green, blue, alpha = pixels[corner]
        if alpha <= _ALPHA_FLOOR:
            continue
        votes[(red, green, blue)] += 1
    if votes:
        winner, _count = votes.most_common(1)[0]
        return winner
    return (0, 255, 0)
def _near_key_mask(image, key: tuple[int, int, int], tol: int = 48):
    """Build an ``L`` mask that is 255 where every RGB channel is within *tol* of *key*.

    The tolerance is applied per channel (not as a Euclidean distance), which
    keeps the mask tight: near-pure backdrop is marked, chroma-*tinted*
    character pixels are not. Everything runs as channel point-ops plus
    ``ImageChops.darker`` (a per-pixel minimum), so there is no per-pixel Python.
    """
    from PIL import ImageChops

    red, green, blue, _alpha = image.split()
    key_red, key_green, key_blue = key

    def close_to(band, center):
        # 255 where this channel is within tolerance of the key's channel value.
        return band.point(lambda value: 255 if abs(value - center) <= tol else 0)

    # A pixel counts only when all three channels agree; min() acts as AND here.
    mask = ImageChops.darker(close_to(red, key_red), close_to(green, key_green))
    return ImageChops.darker(mask, close_to(blue, key_blue))
def _defringe(rgba):
    """Erode the alpha channel by one pixel to drop the keyed-edge color ring.

    After chroma keying, the antialiased band where sprite meets backdrop is a
    key/sprite blend that is too far from the key to be removed, so it rings the
    cutout in the backdrop color. A 3x3 minimum filter on the alpha shaves that
    band. The image is modified in place and also returned.
    """
    from PIL import ImageFilter

    eroded_alpha = rgba.getchannel("A").filter(ImageFilter.MinFilter(3))
    rgba.putalpha(eroded_alpha)
    return rgba
def remove_background(image, *, chroma_key: tuple[int, int, int] | None = None, threshold: float = 90.0):
    """Return *image* (RGBA) with its flat background keyed out to transparent.

    Three paths, chosen in this order:

    1. The strip already has a transparent background: no keying is done; only
       enclosed alpha holes are repaired.
    2. The key is strongly saturated (channel spread >= 120, e.g. hot magenta):
       every opaque pixel within the per-channel tolerance of
       :func:`_near_key_mask` is removed globally with C-level channel ops. This
       also clears pockets enclosed by limbs/capes. *threshold* is NOT used on
       this path.
    3. Otherwise (desaturated key such as near-white/gray): a **border
       flood-fill** removes only background-coloured pixels *connected to an
       edge*, using Euclidean RGB distance <= *threshold*. A global match would
       punch holes wherever an interior highlight matches the backdrop (a light
       belly against a near-white background); flood-fill keeps those pixels
       because they are not reachable from the border.

    Paths 2 and 3 finish with :func:`_defringe` to shave the antialiased ring.

    Args:
        image: Any PIL image; it is converted to RGBA first.
        chroma_key: Backdrop color. When falsy, the dominant corner color is
            sampled instead.
        threshold: Maximum RGB distance from the key for the flood-fill path.

    Returns:
        A new RGBA image with the background made fully transparent.
    """
    from collections import deque
    from PIL import Image, ImageChops

    rgba = image.convert("RGBA")
    if _has_transparency(rgba):
        return _repair_internal_alpha_holes(rgba)

    key = chroma_key or _dominant_corner_color(rgba)
    w, h = rgba.width, rgba.height
    transparent = Image.new("RGBA", rgba.size, (0, 0, 0, 0))

    # Fast path for strongly-saturated chroma keys (our normal sprite prompts use
    # hot magenta): remove all near-key opaque pixels with C-level channel ops.
    # This clears both border-connected backdrop and enclosed pockets between
    # connected limbs/capes, without a Python flood over the whole strip.
    if max(key) - min(key) >= 120:
        near = _near_key_mask(rgba, key)  # L mask, 255 where near key
        opaque = rgba.getchannel("A").point(lambda a: 255 if a > _ALPHA_FLOOR else 0)
        remove_mask = ImageChops.darker(near, opaque)
        return _defringe(Image.composite(transparent, rgba, remove_mask))

    # Desaturated key: border flood-fill only. Interior pixels are never seeded
    # here, because a character can legitimately share a near-white/gray key.
    px = rgba.load()

    def _is_bg(x: int, y: int) -> bool:
        r, g, b, a = px[x, y]
        return a > _ALPHA_FLOOR and _color_distance(r, g, b, key) <= threshold

    visited = bytearray(w * h)
    # Removals are recorded in a flat mask and applied in one composite at the
    # end; writing each pixel through PixelAccess dominated the pipeline's cost.
    remove = bytearray(w * h)
    queue: deque[tuple[int, int]] = deque()

    # Seed from every border pixel that looks like background.
    for x in range(w):
        for y in (0, h - 1):
            if _is_bg(x, y) and not visited[y * w + x]:
                visited[y * w + x] = 1
                queue.append((x, y))
    for y in range(h):
        for x in (0, w - 1):
            if _is_bg(x, y) and not visited[y * w + x]:
                visited[y * w + x] = 1
                queue.append((x, y))

    # 4-connected flood. A pixel is marked visited the first time it is looked
    # at, whether or not it turns out to be background, so it is tested once.
    while queue:
        x, y = queue.popleft()
        remove[y * w + x] = 1
        for nx, ny in ((x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)):
            if 0 <= nx < w and 0 <= ny < h:
                idx = ny * w + nx
                if not visited[idx]:
                    visited[idx] = 1
                    if _is_bg(nx, ny):
                        queue.append((nx, ny))

    # One C-level composite: paint the flooded pixels to (0,0,0,0).
    mask = Image.frombytes("L", (w, h), bytes(remove)).point(lambda v: 255 if v else 0)
    return _defringe(Image.composite(transparent, rgba, mask))
def _repair_internal_alpha_holes(image):
    """Plug transparent islands that are completely surrounded by sprite pixels.

    Some providers return "transparent" PNGs with swiss-cheese alpha inside the
    character. There is no opaque backdrop to key, so the alpha mask itself is
    repaired: transparent regions that reach an image edge stay background,
    while enclosed transparent regions are painted opaque with the average
    color of the opaque pixels bordering them. Connectivity is 4-neighbour.
    Returns a new RGBA image; the input is not modified.
    """
    from collections import deque

    rgba = image.convert("RGBA")
    w, h = rgba.size
    px = rgba.load()
    seen = bytearray(w * h)
    steps = ((1, 0), (-1, 0), (0, 1), (0, -1))

    def see_through(x: int, y: int) -> bool:
        return px[x, y][3] <= _ALPHA_FLOOR

    def flood(sx: int, sy: int) -> list[tuple[int, int]]:
        # Mark and return the transparent region containing (sx, sy).
        seen[sy * w + sx] = 1
        pending: deque[tuple[int, int]] = deque([(sx, sy)])
        region: list[tuple[int, int]] = []
        while pending:
            x, y = pending.popleft()
            region.append((x, y))
            for dx, dy in steps:
                nx, ny = x + dx, y + dy
                if not (0 <= nx < w and 0 <= ny < h):
                    continue
                idx = ny * w + nx
                if seen[idx] or not see_through(nx, ny):
                    continue
                seen[idx] = 1
                pending.append((nx, ny))
        return region

    def rim_average(region: list[tuple[int, int]]) -> tuple[int, int, int, int]:
        # Average the opaque pixels touching the region; a rim pixel is counted
        # once for every region pixel it touches.
        inside = set(region)
        rim: list[tuple[int, int, int]] = []
        for x, y in region:
            for dx, dy in steps:
                nx, ny = x + dx, y + dy
                if not (0 <= nx < w and 0 <= ny < h) or (nx, ny) in inside:
                    continue
                r, g, b, a = px[nx, ny]
                if a > _ALPHA_FLOOR:
                    rim.append((r, g, b))
        if not rim:
            return (0, 0, 0, 255)
        count = len(rim)
        red, green, blue = (round(sum(channel) / count) for channel in zip(*rim))
        return (red, green, blue, 255)

    # Pass 1: everything transparent and reachable from an edge is background.
    for x in range(w):
        for y in (0, h - 1):
            if see_through(x, y) and not seen[y * w + x]:
                flood(x, y)
    for y in range(h):
        for x in (0, w - 1):
            if see_through(x, y) and not seen[y * w + x]:
                flood(x, y)

    # Pass 2: any transparent pixel still unseen belongs to an enclosed hole.
    for index in range(w * h):
        if seen[index]:
            continue
        y, x = divmod(index, w)
        if not see_through(x, y):
            continue
        hole = flood(x, y)
        paint = rim_average(hole)
        for hx, hy in hole:
            px[hx, hy] = paint
    return rgba
# ───────────────────────── frame extraction ─────────────────────────
def _fit_to_cell(image):
    """Return a ``CELL_WIDTH x CELL_HEIGHT`` cell with the sprite shrunk-to-fit and centered.

    Side bleed is dropped first, then the content bounding box is cropped. The
    sprite is only ever scaled *down* (the scale is capped at 1.0) so it fits
    inside the cell minus ``_CELL_PAD``. An image with no content yields a blank
    transparent cell.
    """
    from PIL import Image

    cell = Image.new("RGBA", (CELL_WIDTH, CELL_HEIGHT), (0, 0, 0, 0))
    cleaned = _drop_side_bleed(image)
    content = cleaned.getbbox()
    if content is None:
        return cell

    sprite = cleaned.crop(content)
    shrink = min(
        (CELL_WIDTH - _CELL_PAD) / sprite.width,
        (CELL_HEIGHT - _CELL_PAD) / sprite.height,
        1.0,
    )
    if shrink != 1.0:
        # NEAREST on purpose: the generated "pixel art" has hard edges, and an
        # interpolating filter would anti-alias them into a blurry sprite once
        # the renderer upscales the cell.
        scaled_size = (
            max(1, round(sprite.width * shrink)),
            max(1, round(sprite.height * shrink)),
        )
        sprite = sprite.resize(scaled_size, Image.Resampling.NEAREST)

    offset = ((CELL_WIDTH - sprite.width) // 2, (CELL_HEIGHT - sprite.height) // 2)
    cell.alpha_composite(sprite, offset)
    return cell
def _drop_side_bleed(image):
    """Blank out low-mass left/right lobes that are bleed from a neighbouring pose.

    The frame's column alpha profile is split into content runs separated by
    empty columns. Any run whose summed profile is below ``_SIDE_LOBE_RATIO``
    times the heaviest run is treated as a sliver of the adjacent pose and its
    columns (plus the gaps around it) are made fully transparent. Sizeable
    lobes survive so wide poses and real limbs are kept. With fewer than two
    runs, or when every run is kept, the RGBA conversion is returned unchanged.
    """
    rgba = image.convert("RGBA")
    width, height = rgba.size
    profile = _column_profile(rgba)
    lobes = _content_runs(profile)
    if len(lobes) < 2:
        return rgba

    weights = [sum(profile[start:stop]) for start, stop in lobes]
    cutoff = max(weights) * _SIDE_LOBE_RATIO
    survivors = [lobe for lobe, weight in zip(lobes, weights) if weight >= cutoff]
    if len(survivors) == len(lobes):
        return rgba

    # Clear every column band outside the surviving lobes with box fills.
    cleaned = rgba.copy()
    cursor = 0
    for start, stop in survivors:
        if start > cursor:
            cleaned.paste((0, 0, 0, 0), (cursor, 0, start, height))
        cursor = stop
    if cursor < width:
        cleaned.paste((0, 0, 0, 0), (cursor, 0, width, height))
    return cleaned
def _erase_long_axis_lines(image):
    """Remove thin slot-spanning guide/floor/divider lines.

    Image models sometimes satisfy "baseline" / "cell" language by drawing
    literal horizontal floors or vertical panel dividers. They survive chroma
    keying and connect otherwise clean poses. A row (column) is a candidate when
    at least 85% of its pixels are opaque; only runs of at most 4 consecutive
    candidate rows (columns) are erased, so thick sprite body rows are left
    alone.

    Opaque pixels are counted from a single thresholded byte buffer rather than
    per-pixel ``getpixel`` calls, which keeps the work at C speed on full
    strips.

    Returns a new RGBA image; the input is not modified.
    """
    rgba = image.convert("RGBA").copy()
    w, h = rgba.size
    # One byte per pixel, row-major: 1 where opaque, 0 where background.
    opaque = rgba.getchannel("A").point(lambda a: 1 if a > _ALPHA_FLOOR else 0).tobytes()

    def _thin_groups(indices: list[int]) -> list[tuple[int, int]]:
        # Collapse sorted indices into half-open runs, keeping runs <= 4 thick.
        groups: list[tuple[int, int]] = []
        start = prev = None
        for idx in indices:
            if prev is not None and idx == prev + 1:
                prev = idx
                continue
            if start is not None and prev - start + 1 <= 4:
                groups.append((start, prev + 1))
            start = prev = idx
        if start is not None and prev - start + 1 <= 4:
            groups.append((start, prev + 1))
        return groups

    wide_rows = [y for y in range(h) if opaque.count(1, y * w, (y + 1) * w) >= w * 0.85]
    tall_cols = [x for x in range(w) if opaque[x::w].count(1) >= h * 0.85]

    for top, bottom in _thin_groups(wide_rows):
        rgba.paste((0, 0, 0, 0), (0, top, w, bottom))
    for left, right in _thin_groups(tall_cols):
        rgba.paste((0, 0, 0, 0), (left, 0, right, h))
    return rgba
def _component_boxes(image) -> list[tuple[tuple[int, int, int, int], int]]:
    """List the 4-connected opaque components of *image* as ``[(bbox, mass)]``.

    ``bbox`` is ``(left, top, right, bottom)`` in image coordinates with
    exclusive right/bottom; ``mass`` is the component's pixel count. A pixel is
    opaque when its alpha exceeds ``_ALPHA_FLOOR``. The scan is limited to the
    image's content bounding box, and components are emitted in row-major order
    of their first pixel. An image with no content yields ``[]``.
    """
    from collections import deque

    rgba = image.convert("RGBA")
    content = rgba.getbbox()
    if content is None:
        return []
    origin_x, origin_y, limit_x, limit_y = content
    w = limit_x - origin_x
    h = limit_y - origin_y
    alpha = rgba.getchannel("A").load()
    seen = bytearray(w * h)
    found: list[tuple[tuple[int, int, int, int], int]] = []

    def solid(x: int, y: int) -> bool:
        # Coordinates are relative to the content box.
        return alpha[origin_x + x, origin_y + y] > _ALPHA_FLOOR

    for seed in range(w * h):
        if seen[seed]:
            continue
        seen[seed] = 1
        seed_y, seed_x = divmod(seed, w)
        if not solid(seed_x, seed_y):
            continue

        min_x = max_x = seed_x
        min_y = max_y = seed_y
        count = 0
        frontier: deque[tuple[int, int]] = deque([(seed_x, seed_y)])
        while frontier:
            x, y = frontier.popleft()
            count += 1
            min_x, max_x = min(min_x, x), max(max_x, x)
            min_y, max_y = min(min_y, y), max(max_y, y)
            for nx, ny in ((x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)):
                if not (0 <= nx < w and 0 <= ny < h):
                    continue
                idx = ny * w + nx
                if seen[idx]:
                    continue
                # Mark before testing so background pixels are checked only once.
                seen[idx] = 1
                if solid(nx, ny):
                    frontier.append((nx, ny))

        box = (origin_x + min_x, origin_y + min_y, origin_x + max_x + 1, origin_y + max_y + 1)
        found.append((box, count))
    return found
def _isolate_slot_subject(image):
    """Return the slot with only its main subject (and plausible accessories) kept.

    Thin guide lines are erased first. The heaviest connected component is the
    subject. Another component is kept only when it is heavy enough (at least
    24 px and 3.5% of the subject's mass) AND either overlaps at least 30% of
    the subject's width horizontally or has its horizontal center within the
    subject's span widened by 25% on each side. Everything else (sparkles,
    tears, noise) is dropped. Kept components are copied by bounding box onto a
    fresh transparent canvas.
    """
    from PIL import Image

    cleaned = _erase_long_axis_lines(image)
    components = _component_boxes(cleaned)
    if not components:
        return cleaned

    body_box, body_mass = max(components, key=lambda entry: entry[1])
    body_left, _body_top, body_right, _body_bottom = body_box
    body_width = max(1, body_right - body_left)
    mass_floor = max(24, body_mass * 0.035)

    def belongs(box: tuple[int, int, int, int], mass: int) -> bool:
        if box == body_box:
            return True
        if mass < mass_floor:
            return False
        left, _top, right, _bottom = box
        shared = max(0, min(right, body_right) - max(left, body_left))
        middle = (left + right) / 2
        beside_body = (body_left - body_width * 0.25) <= middle <= (body_right + body_width * 0.25)
        # Halos and similar accessories sit over the body column; stray effects do not.
        return shared >= body_width * 0.3 or beside_body

    canvas = Image.new("RGBA", cleaned.size, (0, 0, 0, 0))
    for box, mass in components:
        if belongs(box, mass):
            canvas.alpha_composite(cleaned.crop(box), (box[0], box[1]))
    return canvas
def _has_slot_padding(image) -> bool:
    """Check that the content keeps a minimum empty margin on every slot edge.

    The required margin is 2.5% of the width horizontally (clamped to 4..12 px)
    and 2% of the height vertically (clamped to 4..16 px). An image with no
    content at all is reported as *not* padded.
    """
    content = image.getbbox()
    if content is None:
        return False
    width, height = image.size
    need_x = max(4, min(12, round(width * 0.025)))
    need_y = max(4, min(16, round(height * 0.02)))
    left, top, right, bottom = content
    horizontal_ok = min(left, width - right) >= need_x
    vertical_ok = min(top, height - bottom) >= need_y
    return horizontal_ok and vertical_ok
def _slot_bounds(width: int, frame_count: int) -> list[tuple[int, int]]:
    """Split ``[0, width)`` into *frame_count* near-equal ``(left, right)`` column spans.

    Edges are rounded from the exact fractional positions, so adjacent slots
    share an edge and the last slot ends at *width*.
    """
    bounds: list[tuple[int, int]] = []
    for index in range(frame_count):
        start = round(index * width / frame_count)
        stop = round((index + 1) * width / frame_count)
        bounds.append((start, stop))
    return bounds
def _group_component_rows(boxes: list[tuple[int, int, int, int]]) -> list[list[tuple[int, int, int, int]]]:
    """Cluster boxes into visual rows (top→bottom), each row ordered left→right.

    Boxes are visited by ascending vertical center and join the first existing
    row whose running mean center is within a tolerance of their own center;
    otherwise they open a new row. The tolerance is 55% of the median box
    height, with a floor of 12 px.
    """
    if not boxes:
        return []

    def mid_y(box: tuple[int, int, int, int]) -> float:
        return (box[1] + box[3]) / 2

    def mid_x(box: tuple[int, int, int, int]) -> float:
        return (box[0] + box[2]) / 2

    spans = sorted(max(1, box[3] - box[1]) for box in boxes)
    tolerance = max(12, spans[len(spans) // 2] * 0.55)

    rows: list[list[tuple[int, int, int, int]]] = []
    centers: list[float] = []
    for box in sorted(boxes, key=mid_y):
        cy = mid_y(box)
        match = next((i for i, center in enumerate(centers) if abs(cy - center) <= tolerance), None)
        if match is None:
            rows.append([box])
            centers.append(cy)
            continue
        rows[match].append(box)
        # Keep the row center as the mean of its members' centers.
        centers[match] = sum(mid_y(member) for member in rows[match]) / len(rows[match])

    top_to_bottom = sorted(range(len(rows)), key=centers.__getitem__)
    return [sorted(rows[i], key=mid_x) for i in top_to_bottom]
def _merge_related_boxes(boxes: list[tuple[int, int, int, int]]) -> list[tuple[int, int, int, int]]:
    """Union boxes that evidently belong to a single subject.

    Capes, tails, horns, and held props can key as separate components. Two
    boxes are merged when their vertical overlap is at least 45% of the shorter
    box's height and the horizontal gap between them is at most
    ``max(14, 22% of the narrower box's width)``. The much larger gaps between
    separate poses are therefore never bridged. Passes repeat until one makes
    no merge, because a grown box can become mergeable with another.
    """

    def related(a: tuple[int, int, int, int], b: tuple[int, int, int, int]) -> bool:
        shared_height = max(0, min(a[3], b[3]) - max(a[1], b[1]))
        shorter = max(1, min(a[3] - a[1], b[3] - b[1]))
        gap = max(0, max(a[0], b[0]) - min(a[2], b[2]))
        narrower = max(1, min(a[2] - a[0], b[2] - b[0]))
        return shared_height >= shorter * 0.45 and gap <= max(14, narrower * 0.22)

    current = list(boxes)
    while True:
        merged_any = False
        absorbed = [False] * len(current)
        next_round: list[tuple[int, int, int, int]] = []
        for i, seed in enumerate(current):
            if absorbed[i]:
                continue
            absorbed[i] = True
            grown = tuple(seed)
            for j in range(i + 1, len(current)):
                if absorbed[j]:
                    continue
                other = current[j]
                # Compare against the box as grown so far, not the seed box.
                if related(grown, other):
                    grown = (
                        min(grown[0], other[0]),
                        min(grown[1], other[1]),
                        max(grown[2], other[2]),
                        max(grown[3], other[3]),
                    )
                    absorbed[j] = True
                    merged_any = True
            next_round.append(grown)
        current = next_round
        if not merged_any:
            return current
def _component_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None:
    """Cut *frame_count* frames out of *strip* by detecting connected subjects.

    This copes with models that ignore "one horizontal row" and emit a 2D sprite
    grid. Components lighter than ``max(64, 12% of the heaviest)`` are ignored,
    related parts are merged, the subjects are put in reading order (rows top to
    bottom, left to right within a row) and the first *frame_count* are used.

    Crop window per subject:
      * multi-row layout: the subject box padded by 8% (minimum 8 px) per axis;
      * single row with one frame: the whole strip;
      * single row otherwise: padded in X only, full strip height, so vertical
        motion (jumping, bobbing) is preserved.

    Only the subject's own bounding box is copied into the window; the rest of
    the window stays transparent.

    Args:
        strip: Background-removed RGBA strip.
        frame_count: Number of frames required.
        require_padding: When true, fail if any chosen subject sits closer to a
            strip edge than 1% of the width (clamped 4..12 px) or 1.5% of the
            height (clamped 4..16 px).

    Returns:
        A list of RGBA frames, or ``None`` when detection cannot satisfy the
        request. Detection is retried once after erasing thin guide lines.
    """
    from PIL import Image

    def attempt(source) -> list | None:
        components = _component_boxes(source)
        if not components:
            return None
        heaviest = max(mass for _box, mass in components)
        mass_floor = max(64, heaviest * 0.12)
        subjects = _merge_related_boxes([box for box, mass in components if mass >= mass_floor])
        if len(subjects) < frame_count:
            return None

        rows = _group_component_rows(subjects)
        chosen = [box for row in rows for box in row][:frame_count]
        if len(chosen) < frame_count:
            return None

        if require_padding:
            edge_x = max(4, min(12, round(source.width * 0.01)))
            edge_y = max(4, min(16, round(source.height * 0.015)))
            crowded = any(
                left < edge_x
                or top < edge_y
                or source.width - right < edge_x
                or source.height - bottom < edge_y
                for left, top, right, bottom in chosen
            )
            if crowded:
                return None

        is_grid = len(rows) > 1
        frames = []
        for left, top, right, bottom in chosen:
            pad_x = max(8, round((right - left) * 0.08))
            pad_y = max(8, round((bottom - top) * 0.08))
            if is_grid:
                window = (
                    max(0, left - pad_x),
                    max(0, top - pad_y),
                    min(source.width, right + pad_x),
                    min(source.height, bottom + pad_y),
                )
            elif frame_count == 1:
                window = (0, 0, source.width, source.height)
            else:
                # True one-row strip: keep the full height so vertical motion
                # survives, and narrow only along X.
                window = (max(0, left - pad_x), 0, min(source.width, right + pad_x), source.height)

            canvas = Image.new("RGBA", (window[2] - window[0], window[3] - window[1]), (0, 0, 0, 0))
            # No second component filter here: capes/tails may be legitimately
            # disconnected lobes inside the already-chosen subject box.
            canvas.alpha_composite(
                source.crop((left, top, right, bottom)),
                (left - window[0], top - window[1]),
            )
            frames.append(canvas)
        return frames

    direct = attempt(strip)
    if direct:
        return direct
    return attempt(_erase_long_axis_lines(strip))
def _sever_expected_gutters(strip, frame_count: int):
    """Cut thin vertical gutters at expected frame boundaries before labeling.

    Generated rows often have a shared shadow, glow, motion smear, or 1px bridge
    that connects neighbouring poses, so component detection sees one giant
    blob. Since the requested frame count is known, a narrow band centered on
    each expected boundary is made fully transparent. If a pose truly overlaps
    the boundary, losing a few pixels is better than exporting merged frames.

    The band half-width is 6% of the slot width, clamped to 3..18 px. Only the
    alpha channel is zeroed; RGB values are left as they were. The bands are
    cleared with box pastes on the alpha band instead of per-pixel writes.

    Args:
        strip: RGBA strip to cut.
        frame_count: Number of frames expected across the strip.

    Returns:
        *strip* itself when ``frame_count <= 1``; otherwise a modified copy.
    """
    if frame_count <= 1:
        return strip
    out = strip.copy()
    slot = out.width / frame_count
    half = max(3, min(18, round(slot * 0.06)))
    alpha = out.getchannel("A")
    for i in range(1, frame_count):
        x = round(i * slot)
        left = max(0, x - half)
        right = min(out.width, x + half + 1)
        if left < right:
            alpha.paste(0, (left, 0, right, out.height))
    out.putalpha(alpha)
    return out
def _slot_crops(strip, frame_count: int, *, require_padding: bool = False) -> list | None:
    """Cut *strip* into *frame_count* equal-width, full-height columns.

    Equal columns keep every frame in one shared coordinate space, so a later
    union-crop + shared placement (:func:`normalize_cells`) can preserve the
    row's real motion instead of re-centering each frame. Each column is cleaned
    on its own (subject isolation, then side-bleed removal) so detached effects,
    floors, dividers, and neighbour slivers do not become "frames".

    Returns the cleaned columns, or ``None`` as soon as one column fails the
    padding check when *require_padding* is set.
    """
    full_height = strip.height
    cleaned_slots = []
    for x0, x1 in _slot_bounds(strip.width, frame_count):
        column = strip.crop((x0, 0, x1, full_height))
        cleaned = _drop_side_bleed(_isolate_slot_subject(column))
        if require_padding and not _has_slot_padding(cleaned):
            return None
        cleaned_slots.append(cleaned)
    return cleaned_slots
def _content_runs(profile: list[int], *, threshold: int = 2) -> list[tuple[int, int]]:
    """Find the half-open ``(start, stop)`` spans where *profile* stays above *threshold*.

    Applied to a column projection of the alpha mask, the empty (background)
    columns separate one pose from the next, so the returned spans are the
    candidate frames.
    """
    # A trailing 0 acts as a sentinel so a span reaching the end gets closed.
    lit = [value > threshold for value in [*profile, 0]]
    spans: list[tuple[int, int]] = []
    opened = None
    for x, on in enumerate(lit):
        if on:
            if opened is None:
                opened = x
            continue
        if opened is not None:
            spans.append((opened, x))
            opened = None
    return spans
def _frame_x_ranges(strip, frame_count: int) -> list[tuple[int, int]] | None:
    """Derive one ``(left, right)`` column range per frame from the strip's empty gutters.

    Content spans come from the column alpha profile. Spans whose summed profile
    is under 2% of the heaviest span are discarded as specks. Then:

    * spans == frames: one span per frame.
    * spans > frames: repeatedly merge across the smallest gap (leftmost on
      ties). A detached halo/ear sits a tiny gap from its body while the
      inter-pose gutter is the large gap that survives.
    * spans < frames: poses are touching and cannot be separated by gutters;
      ``None`` is returned.

    Ranges cover content columns only; ``None`` is also returned when the strip
    has no content at all.
    """
    profile = _column_profile(strip)
    spans = _content_runs(profile)
    if not spans:
        return None

    # Stray noise must never count as a pose.
    weights = [sum(profile[start:stop]) for start, stop in spans]
    speck_floor = max(weights) * 0.02
    spans = [span for span, weight in zip(spans, weights) if weight >= speck_floor]
    if len(spans) < frame_count:
        return None

    merged = [list(span) for span in spans]
    while len(merged) > frame_count:
        gaps = [merged[i + 1][0] - merged[i][1] for i in range(len(merged) - 1)]
        tightest = gaps.index(min(gaps))
        merged[tightest][1] = merged[tightest + 1][1]
        del merged[tightest + 1]
    return [(start, stop) for start, stop in merged]
def _significant_subject_boxes(image) -> list[tuple[int, int, int, int]]:
    """Return merged bounding boxes of the components heavy enough to be subjects.

    A component qualifies when its mass is at least ``max(32, 12% of the
    heaviest component)``; qualifying boxes are then merged when related.
    """
    components = _component_boxes(image)
    if not components:
        return []
    heaviest = max(mass for _box, mass in components)
    cutoff = max(32, heaviest * 0.12)
    heavy_boxes = [box for box, mass in components if mass >= cutoff]
    return _merge_related_boxes(heavy_boxes)
def _validate_extracted_frames(frames: list, frame_count: int) -> None:
    """Raise when the extracted row is malformed rather than letting it ship.

    A bad provider roll can collapse a strip into tiny repeated poses. Letting
    that through makes normalization see a huge motion envelope and shrink the
    whole pet to postage-stamp size, so the row is rejected here and the caller
    can regenerate it.

    Raises:
        ValueError: if the frame count is wrong, a frame is empty, a frame holds
            three or more separated significant subjects, or (for multi-frame
            rows) a frame is much wider than the median frame without being
            proportionally taller.
    """
    if len(frames) != frame_count:
        raise ValueError(f"expected {frame_count} frames, got {len(frames)}")

    extents = []
    for i, frame in enumerate(frames):
        extent = frame.getbbox()
        if extent is None:
            raise ValueError(f"frame {i} is empty")
        if len(_significant_subject_boxes(frame)) >= 3:
            raise ValueError(f"frame {i} contains multiple separated subjects")
        extents.append(extent)

    if frame_count <= 1:
        return

    sizes = [(right - left, bottom - top) for left, top, right, bottom in extents]
    middle = len(sizes) // 2
    typical_w = max(1, sorted(width for width, _height in sizes)[middle])
    typical_h = max(1, sorted(height for _width, height in sizes)[middle])
    # A legitimate wing/arm can be wider than the median pose; several times
    # wider without extra height usually means mini-poses packed into one frame.
    width_limit = max(typical_w * 3.0, typical_w + 96)
    height_limit = typical_h * 1.6
    for i, (width, height) in enumerate(sizes):
        if width > width_limit and height <= height_limit:
            raise ValueError(f"frame {i} is a multi-pose width outlier")
def extract_strip_frames(
    strip,
    frame_count: int,
    *,
    chroma_key: tuple[int, int, int] | None = None,
    method: str = "auto",
    fit: bool = True,
) -> list:
    """Turn one generated row strip into exactly *frame_count* frames.

    The background is removed first. Extraction then runs from strict to
    lenient:

    1. Strict: connected-subject detection with edge padding required, then
       equal-width slot slicing with per-slot padding required.
    2. If both strict passes fail and *method* is ``"components"``, raise.
    3. Lenient salvage (any other *method*): subject detection without the
       padding rule; then real empty gutters; then gutters after severing the
       expected frame boundaries; finally raw equal slots.

    Whatever path produced the frames, they are validated before returning.

    Args:
        strip: A PIL image, or a ``str`` / ``Path`` to open.
        frame_count: Number of frames the row must yield.
        chroma_key: Backdrop color forwarded to :func:`remove_background`.
        method: ``"components"`` disables the lenient salvage; other values
            allow it.
        fit: When true (default) each frame is fitted and centered into a cell
            via :func:`_fit_to_cell`. Pass ``False`` to get the raw frames, e.g.
            for :func:`normalize_cells`.

    Returns:
        A list of *frame_count* RGBA images.

    Raises:
        ValueError: when strict segmentation fails under ``method="components"``,
            or when the extracted frames fail validation.
    """
    from PIL import Image

    if isinstance(strip, (str, Path)):
        with Image.open(strip) as opened:
            rgba = opened.convert("RGBA")
    else:
        rgba = strip.convert("RGBA")
    keyed = remove_background(rgba, chroma_key=chroma_key)

    # Strict: count real subjects first (handles one-row strips and 2D grids
    # without stacking two visual rows into one frame), then padded slots.
    frames = _component_crops(keyed, frame_count, require_padding=True)
    if frames is None:
        frames = _slot_crops(keyed, frame_count, require_padding=True)

    if frames is None:
        if method == "components":
            raise ValueError(f"could not segment {frame_count} padded sprites from strip")
        # Lenient: object extraction again, just without edge-padding enforcement.
        frames = _component_crops(keyed, frame_count, require_padding=False)

    if frames is None:
        # Gutter-based salvage: real gutters, then severed boundaries, then slots.
        source = keyed
        ranges = _frame_x_ranges(source, frame_count)
        if ranges is None:
            source = _sever_expected_gutters(keyed, frame_count)
            ranges = _frame_x_ranges(source, frame_count)
        if ranges is None:
            frames = _slot_crops(source, frame_count, require_padding=False) or []
        else:
            full_height = source.height
            pad = max(2, min(16, round((source.width / max(1, frame_count)) * 0.04)))
            frames = []
            for left, right in ranges:
                window = (max(0, left - pad), 0, min(source.width, right + pad), full_height)
                frames.append(_drop_side_bleed(_isolate_slot_subject(source.crop(window))))

    _validate_extracted_frames(frames, frame_count)
    if not fit:
        return frames
    return [_fit_to_cell(frame) for frame in frames]
def _column_profile(image) -> list[int]:
    """Return one alpha value per column, approximating that column's average alpha.

    The alpha channel is collapsed to a 1px-tall strip with a bilinear resize,
    which runs in C. Values are in 0..255; an empty column yields 0. Uses the
    ``Image.Resampling`` enum, matching the rest of this module.
    """
    from PIL import Image

    alpha = image.getchannel("A")
    return list(alpha.resize((image.width, 1), Image.Resampling.BILINEAR).getdata())
def _best_shift(ref: list[int], prof: list[int], window: int) -> int:
"""Integer dx that best aligns *prof* onto *ref* by cross-correlation.
This is 1-D phase correlation: the body is the dominant mass in the column
profile, so the peak overlap locks onto the body and a flipping arm/cape (a
small secondary bump) doesn't move the match. Proven on the jitter case to
cut body drift from ~9px to ~1px where a centroid/bbox anchor cannot.
"""
n = len(ref)
best_score: float | None = None
best = 0
for d in range(-window, window + 1):
score = 0
for x in range(max(0, d), min(n, n + d)):
score += ref[x] * prof[x - d]
if best_score is None or score > best_score:
best_score = score
best = d
return best
def normalize_cells(frames_by_state: dict[str, list], *, pad: int = _NORMALIZE_PAD) -> dict[str, list]:
    """Place every frame of every state into a ``CELL_WIDTH x CELL_HEIGHT`` cell.

    Cropping, scaling and centering each frame on its own makes an animation
    jitter: a moving limb changes the bounding box and a per-frame scale makes
    the character pulse. This routine instead works per state and then per pet:

    1. **Horizontal registration.** Frames are padded to a common canvas, each
       frame's column profile is cross-correlated against the per-column
       median profile of the state, and the frame is shifted by the winning
       integer offset.
    2. **Shared crop window.** The union of the registered frames' bounding
       boxes is cropped out of every frame of the state.
    3. **One global scale constant.** Each state is scaled by ``K / pose_h``
       where ``pose_h`` is the state's median frame height and ``K`` is the
       largest value that keeps every state's union window inside
       ``(CELL_WIDTH - pad, CELL_HEIGHT - pad)``.

    Args:
        frames_by_state: Mapping of state name to its list of frame images.
        pad: Margin subtracted from the cell size to get the fill target; half
            of it is kept below the sprite.

    Returns:
        Mapping of state name to finished cells. A state whose frames are all
        blank gets one transparent cell per input frame.
    """
    from PIL import Image

    clear = (0, 0, 0, 0)

    def empty_cell():
        return Image.new("RGBA", (CELL_WIDTH, CELL_HEIGHT), clear)

    def upper_median(values):
        # Middle element of the sorted values (upper one for even counts), so
        # a single limb/cape outlier does not move the result.
        ordered = sorted(values)
        return ordered[len(ordered) // 2]

    result: dict[str, list] = {}
    staged: dict[str, tuple[list, tuple[int, int, int, int], tuple[int, int]]] = {}

    # Fill target inside the cell; the K cap further down keeps a tall motion
    # envelope (e.g. a jump's lift) from clipping.
    fit_w = CELL_WIDTH - pad
    fit_h = CELL_HEIGHT - pad

    for state, frames in frames_by_state.items():
        sprites = [frame.convert("RGBA") for frame in frames]
        has_content = any(sprite.getbbox() for sprite in sprites)
        if not has_content:
            result[state] = [empty_cell() for _ in frames]
            continue

        # Grow every sprite onto one canvas size so profiles line up by column.
        canvas_w = max(sprite.width for sprite in sprites)
        canvas_h = max(sprite.height for sprite in sprites)
        padded = []
        for sprite in sprites:
            if sprite.size == (canvas_w, canvas_h):
                padded.append(sprite)
                continue
            grown = Image.new("RGBA", (canvas_w, canvas_h), clear)
            grown.alpha_composite(sprite, (0, 0))
            padded.append(grown)

        # Per-column median across frames is the registration reference.
        profiles = [_column_profile(sprite) for sprite in padded]
        middle = len(profiles) // 2
        reference = [sorted(column)[middle] for column in zip(*profiles)]

        # The search window doubles as the side margin, so a shifted sprite
        # always stays on the widened sheet.
        window = max(8, canvas_w // 5)
        registered = []
        for sprite, profile in zip(padded, profiles):
            dx = _best_shift(reference, profile, window)
            sheet = Image.new("RGBA", (canvas_w + 2 * window, canvas_h), clear)
            sheet.alpha_composite(sprite, (window + dx, 0))
            registered.append(sheet)

        # Union window over the registered frames plus the median pose size.
        boxes = [box for box in (sheet.getbbox() for sheet in registered) if box]
        lefts, tops, rights, bottoms = zip(*boxes)
        window_box = (min(lefts), min(tops), max(rights), max(bottoms))
        pose_size = (
            upper_median([r - l for l, r in zip(lefts, rights)]),
            upper_median([b - t for t, b in zip(tops, bottoms)]),
        )
        staged[state] = (registered, window_box, pose_size)

    if not staged:
        return result

    # Resolve the single global constant K: each state is scaled by
    # K / pose_h, and K is capped so every state's union window still fits.
    k = fit_h
    for _sheets, (left, top, right, bottom), (_pose_w, pose_h) in staged.values():
        union_w = right - left
        union_h = bottom - top
        k = min(k, fit_h * pose_h / max(1, union_h), fit_w * pose_h / max(1, union_w))

    floor_y = CELL_HEIGHT - pad // 2
    for state, (sheets, window_box, (_pose_w, pose_h)) in staged.items():
        left, top, right, bottom = window_box
        union_w = right - left
        union_h = bottom - top
        scale = k / max(1, pose_h)
        out_w = max(1, round(union_w * scale))
        out_h = max(1, round(union_h * scale))
        # Centered horizontally, resting on a shared baseline near the bottom.
        origin = (round((CELL_WIDTH - out_w) / 2), round(floor_y - out_h))

        cells = []
        for sheet in sheets:
            piece = sheet.crop(window_box)
            if piece.size != (out_w, out_h):
                # NEAREST keeps the pixel-art edges crisp; LANCZOS blurred them.
                piece = piece.resize((out_w, out_h), Image.Resampling.NEAREST)
            cell = empty_cell()
            cell.alpha_composite(piece, origin)
            cells.append(cell)
        result[state] = cells

    return result
# ───────────────────────── atlas composition ─────────────────────────
def single_frame(image, *, fit: bool = True):
    """Turn one standalone image (e.g. the base look) into a single frame.

    Serves as the idle fallback so a pet still renders when the idle row could
    not be generated. A ``str``/``Path`` input is opened and converted to RGBA
    first; any other input is handed to :func:`remove_background` as given.

    With ``fit=True`` the keyed sprite is fitted into a finished cell. With
    ``fit=False`` the keyed sprite only has its side bleed dropped, leaving the
    placement to :func:`normalize_cells` together with the other frames.
    """
    from PIL import Image

    if isinstance(image, (str, Path)):
        with Image.open(image) as handle:
            image = handle.convert("RGBA")

    sprite = remove_background(image)
    if fit:
        return _fit_to_cell(sprite)
    return _drop_side_bleed(sprite)
def _clear_transparent_rgb(image):
    """Zero the RGB of fully-transparent pixels (no colored-halo residue).

    Returns a new RGBA image in which every pixel whose alpha is 0 becomes
    ``(0, 0, 0, 0)`` and every other pixel is copied unchanged. The input image
    is not modified.

    The work is done with a masked composite inside Pillow rather than a
    per-pixel Python loop, which matters at full-atlas size.
    """
    from PIL import Image

    rgba = image.convert("RGBA")
    # Binary mask: 255 wherever the pixel has any alpha, 0 where it is fully
    # transparent. With only 0/255 in the mask the composite is a pure select.
    keep = rgba.getchannel("A").point(lambda a: 255 if a else 0)
    cleared = Image.new("RGBA", rgba.size, (0, 0, 0, 0))
    return Image.composite(rgba, cleared, keep)
def mirror_frames(frames: list) -> list:
    """Return a left-right mirrored RGBA copy of each frame, in the same order.

    Used to derive ``running-left`` from an approved ``running-right`` row.
    Each frame is flipped individually and keeps its position in the list, so
    the leftward loop plays with the rightward loop's frame order and timing.
    This is NOT a whole-strip reverse (which would play the animation
    backwards), matching the petdex/Codex mirror rule.
    """
    from PIL import Image

    # Prefer the Image.Transpose namespace; fall back to the constant on the
    # Image module itself when that namespace is absent.
    flip_code = getattr(Image, "Transpose", Image).FLIP_LEFT_RIGHT
    rgba_frames = (frame.convert("RGBA") for frame in frames)
    return [rgba.transpose(flip_code) for rgba in rgba_frames]
def compose_atlas(frames_by_state: dict[str, list]):
    """Lay per-state frame lists onto the Hermes atlas sheet.

    Each ``(state, row, count)`` entry of ``ROW_SPECS`` claims one atlas row.
    At most ``count`` frames of that state are placed left to right; any extra
    frames are ignored, and missing states or short lists leave their cells
    transparent. A frame that is not already cell-sized is fitted into a cell
    before it is placed. The finished sheet has the RGB of fully-transparent
    pixels cleared before it is returned.
    """
    from PIL import Image

    cell_size = (CELL_WIDTH, CELL_HEIGHT)
    sheet = Image.new("RGBA", (ATLAS_WIDTH, ATLAS_HEIGHT), (0, 0, 0, 0))

    for state, row, count in ROW_SPECS:
        state_frames = frames_by_state.get(state)
        if not state_frames:
            continue
        y = row * CELL_HEIGHT
        for col, frame in enumerate(state_frames[:count]):
            tile = frame.convert("RGBA")
            if tile.size != cell_size:
                tile = _fit_to_cell(tile)
            sheet.alpha_composite(tile, (col * CELL_WIDTH, y))

    return _clear_transparent_rgb(sheet)
def atlas_to_webp_bytes(atlas) -> bytes:
    """Serialize *atlas* to WebP in memory and return the encoded bytes.

    The image is saved with ``lossless=True``, ``quality=100``, ``method=6``
    and ``exact=True``; this is the on-disk pet format.
    """
    webp_options = {"lossless": True, "quality": 100, "method": 6, "exact": True}
    with io.BytesIO() as sink:
        atlas.save(sink, format="WEBP", **webp_options)
        return sink.getvalue()
def validate_atlas(atlas) -> dict:
    """Audit a packed atlas and report blockers plus soft warnings.

    *atlas* may be a path (``str``/``Path``) or an image; either way it is
    converted to RGBA before inspection. The report is a dict with the keys
    ``ok``, ``width``, ``height``, ``errors``, ``warnings`` and
    ``filled_states``; ``ok`` is true only when ``errors`` is empty.

    Errors (blockers):
      * wrong atlas dimensions (reported alone; no further checks run),
      * no state row contains any visible pixel,
      * the global median sprite height is below the minimum,
      * a state holds one frame far wider than its median but not taller
        (a multi-pose frame),
      * a state's median sprite is collapsed relative to the global median,
      * fully-transparent pixels that still carry RGB.

    Warnings (soft): a state row with no visible pixels at all.
    """
    from PIL import Image

    if isinstance(atlas, (str, Path)):
        with Image.open(atlas) as opened:
            atlas = opened.convert("RGBA")
    else:
        atlas = atlas.convert("RGBA")

    errors: list[str] = []
    warnings: list[str] = []
    filled_states: list[str] = []

    def report() -> dict:
        return {
            "ok": not errors,
            "width": atlas.width,
            "height": atlas.height,
            "errors": errors,
            "warnings": warnings,
            "filled_states": filled_states,
        }

    if atlas.size != (ATLAS_WIDTH, ATLAS_HEIGHT):
        errors.append(f"expected {ATLAS_WIDTH}x{ATLAS_HEIGHT}, got {atlas.width}x{atlas.height}")
        return report()

    # Occupancy scan: per state, total visible pixels and the bounding box of
    # every non-blank cell.
    state_boxes: dict[str, list[tuple[int, int, int, int]]] = {}
    for state, row, count in ROW_SPECS:
        y = row * CELL_HEIGHT
        cells = [
            atlas.crop((col * CELL_WIDTH, y, (col + 1) * CELL_WIDTH, y + CELL_HEIGHT))
            for col in range(count)
        ]
        visible = sum(sum(cell.getchannel("A").histogram()[1:]) for cell in cells)
        if not visible:
            warnings.append(f"state '{state}' has no frames")
            continue
        filled_states.append(state)
        state_boxes[state] = [box for box in (cell.getbbox() for cell in cells) if box is not None]

    if not filled_states:
        errors.append("atlas is empty — no state produced any frames")

    # A visually valid pet must occupy the cell. A single bad row can otherwise
    # poison global normalization and shrink every state to a tiny postage
    # stamp while every cell is still technically non-empty.
    every_box = [box for boxes in state_boxes.values() for box in boxes]
    sorted_widths = sorted(right - left for left, _top, right, _bottom in every_box)
    sorted_heights = sorted(bottom - top for _left, top, _right, bottom in every_box)
    global_med_w = 0
    global_med_h = 0
    if sorted_widths and sorted_heights:
        global_med_w = sorted_widths[len(sorted_widths) // 2]
        global_med_h = sorted_heights[len(sorted_heights) // 2]
        height_floor = max(56, round(CELL_HEIGHT * 0.28))
        if global_med_h < height_floor:
            errors.append(f"atlas sprites are too small after normalization (median frame height {global_med_h}px)")

    for state, boxes in state_boxes.items():
        if len(boxes) < 2:
            continue
        state_widths = sorted(right - left for left, _top, right, _bottom in boxes)
        state_heights = sorted(bottom - top for _left, top, _right, bottom in boxes)
        med_w = max(1, state_widths[len(state_widths) // 2])
        med_h = max(1, state_heights[len(state_heights) // 2])
        widest = state_widths[-1]
        tallest = state_heights[-1]

        # Much wider than the median but about as tall: several poses fused
        # into one cell.
        if widest > max(med_w * 3.0, med_w + 96) and tallest <= med_h * 1.6:
            errors.append(f"state '{state}' contains a multi-pose frame outlier")

        # Per-state collapse guard: one malformed row (tiny slivers / chopped
        # fragments) should not pass because other rows are healthy.
        if not (global_med_w and global_med_h):
            continue
        width_floor = max(32, round(global_med_w * 0.42))
        state_height_floor = max(40, round(global_med_h * 0.50))
        if med_w < width_floor or med_h < state_height_floor:
            errors.append(
                f"state '{state}' appears collapsed (median {med_w}x{med_h}px, global median {global_med_w}x{global_med_h}px)"
            )

    # Transparent pixels must carry zero RGB (no halo residue).
    raw = atlas.tobytes()
    residue = sum(
        1
        for i in range(0, len(raw), 4)
        if raw[i + 3] == 0 and (raw[i] or raw[i + 1] or raw[i + 2])
    )
    if residue:
        errors.append(f"{residue} transparent pixels retain RGB residue")

    return report()