From dad62c4c474164b19cfd7b5e96746a2cdde50931 Mon Sep 17 00:00:00 2001 From: Bartok Date: Tue, 21 Apr 2026 16:29:27 -0400 Subject: [PATCH] fix(whatsapp): auto-convert mp3/wav to ogg/opus in send-media for native voice bubbles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WhatsApp bridge (bridge.js) only sets ptt:true when file extension is .ogg or .opus, causing mp3/wav files (from Edge TTS, NeuTTS, etc.) to arrive as file attachments instead of voice bubbles — silently, with no error. Fix: when audio type is sent with a non-ogg/opus format, run ffmpeg conversion to ogg/opus in a temp file before sending. This makes send_voice() self-sufficient regardless of what format the caller provides. Fallback: if ffmpeg is unavailable, original buffer is sent (previous behaviour) with a console.warn — no crash. Addresses veloguardian's review comment on PR #4992. --- scripts/whatsapp-bridge/bridge.js | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js index d1aeb73722..af6d6b54a0 100644 --- a/scripts/whatsapp-bridge/bridge.js +++ b/scripts/whatsapp-bridge/bridge.js @@ -23,8 +23,10 @@ import express from 'express'; import { Boom } from '@hapi/boom'; import pino from 'pino'; import path from 'path'; -import { mkdirSync, readFileSync, writeFileSync, existsSync, readdirSync } from 'fs'; +import { mkdirSync, readFileSync, writeFileSync, existsSync, readdirSync, unlinkSync } from 'fs'; import { randomBytes } from 'crypto'; +import { execSync } from 'child_process'; +import { tmpdir } from 'os'; import qrcode from 'qrcode-terminal'; import { matchesAllowedUser, parseAllowedUsers } from './allowlist.js'; @@ -505,8 +507,31 @@ app.post('/send-media', async (req, res) => { msgPayload = { video: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'video/mp4' }; break; case 'audio': { - const audioMime = (ext === 'ogg' || ext === 'opus') ? 'audio/ogg; codecs=opus' : 'audio/mpeg'; - msgPayload = { audio: buffer, mimetype: audioMime, ptt: ext === 'ogg' || ext === 'opus' }; + // WhatsApp only renders a native voice bubble (ptt) when the file is ogg/opus. + // If the caller passes mp3, wav, m4a etc. (e.g. from Edge TTS / NeuTTS), + // silently convert to ogg/opus via ffmpeg so ptt is always honoured. + let audioBuffer = buffer; + let audioExt = ext; + const needsConversion = !['ogg', 'opus'].includes(ext); + let tmpPath = null; + if (needsConversion) { + tmpPath = path.join(tmpdir(), `hermes_voice_${randomBytes(6).toString('hex')}.ogg`); + try { + execSync( + `ffmpeg -y -i ${JSON.stringify(filePath)} -ar 48000 -ac 1 -c:a libopus ${JSON.stringify(tmpPath)}`, + { timeout: 30000, stdio: 'pipe' } + ); + audioBuffer = readFileSync(tmpPath); + audioExt = 'ogg'; + } catch (convErr) { + // ffmpeg not available or conversion failed — fall back to original format + console.warn('[bridge] ffmpeg conversion failed, sending as file attachment:', convErr.message); + } finally { + try { if (tmpPath && existsSync(tmpPath)) unlinkSync(tmpPath); } catch (_) {} + } + } + const audioMime = (audioExt === 'ogg' || audioExt === 'opus') ? 'audio/ogg; codecs=opus' : 'audio/mpeg'; + msgPayload = { audio: audioBuffer, mimetype: audioMime, ptt: audioExt === 'ogg' || audioExt === 'opus' }; break; } case 'document':