retrotoon-studio/server/_core/imageGeneration.ts

import { storagePut } from "server/storage";

/**
 * Options accepted by `generateImage`.
 */
export type GenerateImageOptions = {
  /** Text prompt forwarded to the image provider. */
  prompt: string;
  /**
   * Optional reference images. An entry carries either a `url` or inline
   * base64 data in `b64Json`; `mimeType`, when set, is used as the content
   * type in preference to the downloaded response header or a default.
   */
  originalImages?: Array<{
    url?: string;
    b64Json?: string;
    mimeType?: string;
  }>;
  /** Mask URL for localized editing (white = edit zone, black = preserve). */
  maskUrl?: string;
  /** Target aspect ratio "W:H" (e.g. "16:9") - used to pick the best provider size and to crop the output. */
  targetAspectRatio?: string;
  /**
   * Explicit target dimensions for the final crop; when both are set the
   * cropped image is also resized to exactly this size. Described as
   * overriding the aspect ratio.
   * NOTE(review): confirm that `generateImage` actually gives these
   * precedence over `targetAspectRatio` when both are supplied.
   */
  targetWidth?: number;
  targetHeight?: number;
};

/**
 * Result of `generateImage`.
 */
export type GenerateImageResponse = {
  /** URL of the stored image, as returned by `storagePut`. */
  url?: string;
  /** Which backend produced the image. */
  provider?: "gemini" | "openai";
  /**
   * Pixel dimensions of the result.
   * NOTE(review): neither provider path visible in this file populates
   * `width`/`height` - confirm whether callers rely on them.
   */
  width?: number;
  height?: number;
};

/**
 * Parse an aspect-ratio string of the form "W:H" (e.g. "16:9", "2.35:1")
 * into the numeric ratio W / H.
 *
 * @param ratio - Ratio string; surrounding whitespace is ignored.
 * @returns The positive, finite ratio, or `null` when the input is missing,
 *   malformed, or would yield a degenerate ratio (either side zero, or a
 *   value too large to represent).
 */
function parseAspectRatio(ratio?: string): number | null {
  if (!ratio) return null;
  const match = /^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/.exec(ratio.trim());
  if (!match) return null;

  const width = Number.parseFloat(match[1] ?? "");
  const height = Number.parseFloat(match[2] ?? "");
  const value = width / height;

  // Rejects "0:H" (0), "W:0" (Infinity) and "0:0" (NaN) in one check, so
  // callers never receive a falsy-but-non-null ratio.
  if (!Number.isFinite(value) || value <= 0) return null;
  return value;
}

/**
 * Choose the gpt-image-1 output size whose aspect ratio is nearest to the
 * requested one.
 *
 * Supported sizes: 1024x1024 (1.0), 1536x1024 (1.5), 1024x1536 (~0.667).
 * Distance is measured on a log scale so "too wide" and "too tall" are
 * penalised symmetrically. On a tie, or when no distance is comparable
 * (e.g. NaN input), the earlier entry in the list wins.
 */
function pickOpenAISize(aspectRatio: number): "1024x1024" | "1536x1024" | "1024x1536" {
  type Candidate = { size: "1024x1024" | "1536x1024" | "1024x1536"; ratio: number };

  const candidates: Candidate[] = [
    { size: "1024x1024", ratio: 1.0 },
    { size: "1536x1024", ratio: 1.5 },
    { size: "1024x1536", ratio: 1 / 1.5 },
  ];

  const distance = (candidate: Candidate): number =>
    Math.abs(Math.log(aspectRatio / candidate.ratio));

  // Strict "<" keeps the current winner on ties, so list order breaks ties.
  const winner = candidates.reduce((current, candidate) =>
    distance(candidate) < distance(current) ? candidate : current
  );
  return winner.size;
}

/**
 * Convert our mask format (white = edit, black = preserve) into the format
 * OpenAI expects: alpha = 0 in edit zones, opaque in preserved zones.
 * When a reference image is supplied the mask is first stretched to the
 * reference image's exact dimensions.
 *
 * @param maskUrl - Absolute http(s) URL, or a path that is resolved against
 *   `http://localhost:3000`.
 * @param refImageBuffer - Encoded reference image whose dimensions the mask
 *   must match.
 * @returns PNG-encoded RGBA mask.
 * @throws Error when the mask cannot be fetched (non-2xx response).
 */
async function convertMaskForOpenAI(maskUrl: string, refImageBuffer?: Buffer): Promise<Buffer> {
  const sharp = (await import("sharp")).default;
  const resp = await fetch(maskUrl.startsWith("http") ? maskUrl : `http://localhost:3000${maskUrl}`);
  if (!resp.ok) throw new Error(`Failed to fetch mask: ${resp.status}`);
  let maskBuffer: Buffer = Buffer.from(await resp.arrayBuffer());

  // If we have a reference image, resize the mask to match it exactly.
  if (refImageBuffer) {
    const refMeta = await sharp(refImageBuffer).metadata();
    if (refMeta.width && refMeta.height) {
      maskBuffer = Buffer.from(await sharp(maskBuffer)
        .resize(refMeta.width, refMeta.height, { fit: "fill" })
        .toBuffer());
    }
  }

  const { data, info } = await sharp(maskBuffer)
    .greyscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  // The raw greyscale output is not guaranteed to be one byte per pixel:
  // sharp may emit several (identical) colour channels and keeps any alpha
  // channel. Step through the data by the reported channel count and read
  // the first channel of each pixel as the grey value.
  const pixelCount = info.width * info.height;
  const stride = info.channels;

  // Pre-fill with 255 so RGB is white everywhere; only alpha is computed.
  const rgba = Buffer.alloc(pixelCount * 4, 255);
  for (let i = 0; i < pixelCount; i++) {
    // white (255) -> alpha 0 (edit), black (0) -> alpha 255 (preserve)
    rgba[i * 4 + 3] = 255 - data[i * stride];
  }

  return sharp(rgba, { raw: { width: info.width, height: info.height, channels: 4 } }).png().toBuffer();
}

/**
 * Center-crop an encoded image to `targetAspect` (width / height) and
 * optionally resize the result to exact pixel dimensions.
 *
 * - Unreadable dimensions: the input buffer is returned untouched.
 * - Aspect already within 0.02 of the target: no crop; the image is only
 *   resized (and re-encoded as PNG) when both target dimensions are given,
 *   otherwise the input buffer is returned as-is.
 * - Otherwise the excess width or height is trimmed equally from both
 *   sides and the result is PNG-encoded.
 */
async function cropToAspectRatio(buffer: Buffer, targetAspect: number, targetWidth?: number, targetHeight?: number): Promise<Buffer> {
  const { default: sharp } = await import("sharp");
  const image = sharp(buffer);
  const { width: srcW, height: srcH } = await image.metadata();
  if (!srcW || !srcH) return buffer;

  // Shared tail: optional exact-size resize, then PNG encode.
  const encode = (pipeline: typeof image): Promise<Buffer> => {
    const sized =
      targetWidth && targetHeight
        ? pipeline.resize(targetWidth, targetHeight, { fit: "fill" })
        : pipeline;
    return sized.png().toBuffer();
  };

  const sourceAspect = srcW / srcH;
  const closeEnough = Math.abs(sourceAspect - targetAspect) < 0.02;
  if (closeEnough) {
    return targetWidth && targetHeight ? encode(image) : buffer;
  }

  // Trim whichever dimension is in excess; the other stays at full size.
  const tooWide = sourceAspect > targetAspect;
  const cropW = tooWide ? Math.round(srcH * targetAspect) : srcW;
  const cropH = tooWide ? srcH : Math.round(srcW / targetAspect);

  const region = {
    left: Math.round((srcW - cropW) / 2),
    top: Math.round((srcH - cropH) / 2),
    width: cropW,
    height: cropH,
  };
  return encode(image.extract(region));
}

/**
 * Generate (or edit) an image, trying Gemini first and falling back to
 * OpenAI gpt-image-1 when Gemini is not configured or fails.
 *
 * The target aspect ratio is taken from the explicit
 * `targetWidth`/`targetHeight` pair when both are positive (they are
 * documented on `GenerateImageOptions` as overriding the ratio string);
 * otherwise it is parsed from `targetAspectRatio`. Deriving the ratio from
 * the dimensions keeps the crop and the final resize consistent, so the
 * output is never stretched.
 *
 * @param options - Prompt, optional reference images/mask and target geometry.
 * @returns The stored image URL plus the provider that produced it.
 * @throws Error when no provider is configured, when Gemini failed and no
 *   OpenAI fallback is configured, or when the OpenAI fallback itself fails.
 */
export async function generateImage(
  options: GenerateImageOptions
): Promise<GenerateImageResponse> {
  const { targetWidth, targetHeight } = options;

  let targetAspect: number | null = null;
  if (targetWidth && targetHeight && targetWidth > 0 && targetHeight > 0) {
    targetAspect = targetWidth / targetHeight;
  } else {
    const parsed = parseAspectRatio(options.targetAspectRatio);
    // Treat a degenerate (zero / non-finite) ratio as "not specified".
    targetAspect = parsed !== null && Number.isFinite(parsed) && parsed > 0 ? parsed : null;
  }

  // Remember why Gemini failed so the reason is not lost when there is no fallback.
  let geminiFailure: string | undefined;

  // Try Gemini first
  if (process.env.GEMINI_API_KEY) {
    try {
      const result = await generateWithGemini(options, targetAspect);
      if (result.url) return { ...result, provider: "gemini" };
      geminiFailure = "no image URL returned";
    } catch (err: unknown) {
      geminiFailure = err instanceof Error ? err.message : String(err);
      console.warn(`[ImageGen] Gemini failed (${geminiFailure.slice(0, 100)}), falling back to OpenAI...`);
    }
  }

  // Fallback to OpenAI
  if (process.env.OPENAI_API_KEY) {
    const result = await generateWithOpenAI(options, targetAspect);
    return { ...result, provider: "openai" };
  }

  if (geminiFailure !== undefined) {
    throw new Error(
      `Image generation failed: Gemini error (${geminiFailure.slice(0, 200)}) and no OPENAI_API_KEY fallback is configured`
    );
  }
  throw new Error("No image generation provider configured (need GEMINI_API_KEY or OPENAI_API_KEY)");
}

/**
 * Generate an image with Gemini (`gemini-2.5-flash-image`) and store it.
 *
 * Reference images are sent inline as base64: http(s) URLs are downloaded,
 * `b64Json` entries are forwarded as-is. A reference that fails to download
 * is skipped with a warning (best effort). `options.maskUrl` is not used by
 * this provider.
 *
 * @param options - Prompt and optional reference images / target dimensions.
 * @param targetAspect - Width / height to crop the result to, or `null` to
 *   keep the provider's output unchanged.
 * @returns The storage URL of the first image found in the response.
 * @throws Error when the API key is missing, the API responds with a
 *   non-2xx status, or the response contains no image.
 */
async function generateWithGemini(options: GenerateImageOptions, targetAspect: number | null): Promise<GenerateImageResponse> {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) throw new Error("Gemini: GEMINI_API_KEY is not set");
  const url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent";

  type GeminiPart =
    | { text: string }
    | { inlineData: { mimeType: string; data: string } };

  // Content-Type headers may carry parameters ("image/jpeg; charset=binary");
  // only the bare media type is forwarded.
  const bareMimeType = (value: string, fallback: string): string =>
    value.split(";", 1)[0]?.trim() || fallback;

  const parts: GeminiPart[] = [{ text: options.prompt }];

  for (const img of options.originalImages ?? []) {
    if (img.url && img.url.startsWith("http")) {
      try {
        const resp = await fetch(img.url);
        if (resp.ok) {
          const buffer = Buffer.from(await resp.arrayBuffer());
          const mime = bareMimeType(
            img.mimeType || resp.headers.get("content-type") || "image/jpeg",
            "image/jpeg"
          );
          parts.push({
            inlineData: { mimeType: mime, data: buffer.toString("base64") },
          });
        }
      } catch (e) {
        console.warn("[Gemini] Failed to download reference image:", e);
      }
    } else if (img.b64Json) {
      parts.push({
        inlineData: { mimeType: img.mimeType || "image/png", data: img.b64Json },
      });
    }
  }

  const payload = {
    contents: [{ parts }],
    generationConfig: { responseModalities: ["TEXT", "IMAGE"] },
  };

  // The key travels in a header rather than the query string so it does not
  // end up in URL logs or error traces.
  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const detail = await response.text().catch(() => "");
    throw new Error(`Gemini ${response.status}: ${detail.slice(0, 150)}`);
  }

  const result = (await response.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } }> };
    }>;
  };

  for (const candidate of result.candidates ?? []) {
    for (const part of candidate.content?.parts ?? []) {
      const encoded = part.inlineData?.data;
      if (!encoded) continue;

      let buffer: Buffer = Buffer.from(encoded, "base64");
      // Crop to target aspect if specified
      if (targetAspect) {
        buffer = Buffer.from(await cropToAspectRatio(buffer, targetAspect, options.targetWidth, options.targetHeight));
      }

      // Random suffix: two requests finishing in the same millisecond must
      // not overwrite each other's object.
      const uniqueSuffix = Math.random().toString(36).slice(2, 10);
      const { url: storageUrl } = await storagePut(
        `generated/${Date.now()}-${uniqueSuffix}.png`,
        buffer,
        "image/png"
      );
      return { url: storageUrl };
    }
  }

  throw new Error("Gemini: no image in response");
}

/**
 * Generate or edit an image with OpenAI `gpt-image-1` and store it.
 *
 * - No usable reference image: calls `/v1/images/generations`.
 * - With reference images (http(s) `url` or inline `b64Json`): calls
 *   `/v1/images/edits`. References that fail to load are skipped with a
 *   warning; if none can be loaded the call fails fast instead of sending
 *   an edit request without an image.
 * - With `maskUrl`: the mask is converted and attached, applying to the
 *   first entry of `originalImages`. If that entry could not be loaded, or
 *   the mask cannot be prepared, the request proceeds as a full edit.
 *
 * @param options - Prompt, optional references/mask and target dimensions.
 * @param targetAspect - Width / height used to pick the request size and to
 *   crop the result; `null` defaults the request size to landscape.
 * @returns The storage URL of the generated image.
 * @throws Error when the API key is missing, no reference could be loaded
 *   for an edit, the API responds with a non-2xx status, or the response
 *   carries no image data.
 */
async function generateWithOpenAI(options: GenerateImageOptions, targetAspect: number | null): Promise<GenerateImageResponse> {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) throw new Error("OpenAI: OPENAI_API_KEY is not set");

  type ReferenceImage = NonNullable<GenerateImageOptions["originalImages"]>[number];

  const references = options.originalImages ?? [];
  const hasReferenceImage = references.some((ref) => Boolean(ref.url || ref.b64Json));
  const maskUrl = options.maskUrl;
  const isInpainting = Boolean(maskUrl);

  // Pick best size based on aspect ratio (default to landscape 16:9 → 1536x1024)
  const size = pickOpenAISize(targetAspect ?? 1.78);

  // Resolve one reference to raw bytes + MIME type; null when it is unusable.
  const loadReference = async (
    img: ReferenceImage
  ): Promise<{ bytes: Buffer; mime: string } | null> => {
    if (img.url && img.url.startsWith("http")) {
      try {
        const resp = await fetch(img.url);
        if (!resp.ok) {
          console.warn(`[OpenAI] Failed to download reference: HTTP ${resp.status}`);
          return null;
        }
        const bytes = Buffer.from(await resp.arrayBuffer());
        const mime = img.mimeType || resp.headers.get("content-type") || "image/png";
        return { bytes, mime };
      } catch (e) {
        console.warn("[OpenAI] Failed to download reference:", e);
        return null;
      }
    }
    if (img.b64Json) {
      return { bytes: Buffer.from(img.b64Json, "base64"), mime: img.mimeType || "image/png" };
    }
    return null;
  };

  let response: Response;

  if (hasReferenceImage) {
    const formData = new FormData();
    formData.append("model", "gpt-image-1");
    formData.append("prompt", options.prompt);
    formData.append("n", "1");
    formData.append("size", size);

    // Bytes of originalImages[0]; the mask is resized to match this image.
    let firstImageBuffer: Buffer | undefined;
    let attachedCount = 0;

    for (const [i, img] of references.entries()) {
      const loaded = await loadReference(img);
      if (!loaded) continue;

      const { bytes, mime } = loaded;
      if (i === 0) firstImageBuffer = bytes;

      const ext = mime.includes("png")
        ? "png"
        : mime.includes("jpeg")
          ? "jpg"
          : mime.includes("webp")
            ? "webp"
            : "png";
      const blob = new Blob([new Uint8Array(bytes)], { type: mime });
      // For inpainting, OpenAI wants "image" (singular), for edits with refs use "image[]"
      const fieldName = isInpainting && i === 0 ? "image" : "image[]";
      formData.append(fieldName, blob, `reference_${i}.${ext}`);
      attachedCount++;
    }

    // An edit request without any image can only be rejected by the API;
    // fail here with an actionable message instead.
    if (attachedCount === 0) {
      throw new Error("OpenAI: no reference image could be loaded for the edit request");
    }

    // Attach mask for inpainting
    if (maskUrl && firstImageBuffer) {
      try {
        const openaiMask = await convertMaskForOpenAI(maskUrl, firstImageBuffer);
        const maskBlob = new Blob([new Uint8Array(openaiMask)], { type: "image/png" });
        formData.append("mask", maskBlob, "mask.png");
      } catch (e) {
        console.warn("[OpenAI] Failed to prepare mask, falling back to full edit:", e);
      }
    }

    response = await fetch("https://api.openai.com/v1/images/edits", {
      method: "POST",
      headers: { Authorization: `Bearer ${apiKey}` },
      body: formData,
    });
  } else {
    response = await fetch("https://api.openai.com/v1/images/generations", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: "gpt-image-1",
        prompt: options.prompt,
        n: 1,
        size,
      }),
    });
  }

  if (!response.ok) {
    const detail = await response.text().catch(() => "");
    throw new Error(`OpenAI ${response.status}: ${detail.slice(0, 200)}`);
  }

  const result = (await response.json()) as {
    data?: Array<{ b64_json?: string; url?: string }>;
  };
  const img = result.data?.[0];
  if (!img) throw new Error("OpenAI: no image in response");

  let buffer: Buffer;
  if (img.b64_json) {
    buffer = Buffer.from(img.b64_json, "base64");
  } else if (img.url) {
    const downloadResp = await fetch(img.url);
    if (!downloadResp.ok) throw new Error("OpenAI: failed to download generated image");
    buffer = Buffer.from(await downloadResp.arrayBuffer());
  } else {
    throw new Error("OpenAI: no b64_json or url in response");
  }

  // Crop to target aspect if specified (OpenAI gives 3:2, we want exact source ratio)
  if (targetAspect) {
    buffer = Buffer.from(await cropToAspectRatio(buffer, targetAspect, options.targetWidth, options.targetHeight));
  }

  // Random suffix: two requests finishing in the same millisecond must not
  // overwrite each other's object.
  const uniqueSuffix = Math.random().toString(36).slice(2, 10);
  const { url: storageUrl } = await storagePut(
    `generated/${Date.now()}-${uniqueSuffix}.png`,
    buffer,
    "image/png"
  );
  return { url: storageUrl };
}