From da34fca2bb800417a12bbfced82d97246b065233 Mon Sep 17 00:00:00 2001 From: jasnoorgill <5494586+jasnoorgill@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:06:24 +0530 Subject: [PATCH] fix(signal): detect ADTS AAC voice notes and remux to MP4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Android Signal delivers voice notes as raw ADTS AAC frames, which share the `0xFF 0xFx` sync word with MPEG-1/2 Layer 3 (MP3). The `_guess_extension` byte-signature test in gateway/platforms/signal.py was matching both, so ADTS AAC was being misclassified as MP3 — saved to disk with the wrong extension and rejected by every major STT API (Groq, OpenAI) because their server-side format sniffers inspect the actual codec, not the file extension. Two changes: 1. Tighten the MP3 vs ADTS disambiguator. ADTS packs `ID`, `layer`, and `protection_absent` into bits 3-0 of byte 1, where `ID=0` and `layer=00` for AAC. Real MP3 has `ID=1` and `layer` in {01, 10, 11}. The mask `0xF6` against target `0xF0` cleanly separates them. 2. Remux raw ADTS AAC to MP4 container at the cache step via `ffmpeg -c:a copy`. Single demux/remux, no re-encode, no quality loss, sub-100ms on a Pi 5. The cached file is a normal `.m4a` that all major STT providers accept. ffmpeg is a transitive dependency of many other Hermes features (TTS, video skills) so this isn't a new install requirement; the remux degrades gracefully to a no-op if ffmpeg is missing. The new helper `_remux_aac_to_m4a` is unit-tested with a real Android voice note from the audio cache that originally triggered the bug, plus synthetic ADTS frames for the byte-level disambiguator and garbage-input graceful failure. Closes the gap that broke transcription for any Android Signal user sending voice messages to Hermes. --- gateway/platforms/signal.py | 84 +++++++++++++++++++++++++++++++++++- tests/gateway/test_signal.py | 72 +++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 1 deletion(-) diff --git a/gateway/platforms/signal.py b/gateway/platforms/signal.py index 3272a921911..df9d07b4f71 100644 --- a/gateway/platforms/signal.py +++ b/gateway/platforms/signal.py @@ -17,6 +17,9 @@ import json import logging import os import random +import shutil +import subprocess +import tempfile import time import uuid from datetime import datetime, timezone @@ -77,7 +80,14 @@ def _parse_comma_list(value: str) -> List[str]: def _guess_extension(data: bytes) -> str: - """Guess file extension from magic bytes.""" + """Guess file extension from magic bytes. + + Android Signal delivers voice notes as raw ADTS AAC frames, which share + the ``0xFF 0xFx`` sync word with MPEG-1/2 Layer 3 (MP3). The byte-1 + layout disambiguates: ADTS packs ``ID layer protection_absent`` into + bits 3-0, where ``ID`` is 0 for MPEG-2/4 AAC and ``layer`` is always + 0 for ADTS. A real MP3 frame has ``ID=1`` and ``layer`` in {1, 2, 3}. + """ if data[:4] == b"\x89PNG": return ".png" if data[:2] == b"\xff\xd8": @@ -93,6 +103,12 @@ def _guess_extension(data: bytes) -> str: if data[:4] == b"OggS": return ".ogg" if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0: + # ``0xFF 0xFx`` is shared by MP3 and ADTS AAC. The discriminator + # is bits 3-1 of byte 1: ADTS has ``ID=0`` and ``layer=00`` (mask + # 0xF6, target 0xF0); MP3 has ``ID=1`` and ``layer`` in {01,10,11} + # (mask 0xF6, target in {0xF2, 0xF4, 0xF6}). + if (data[1] & 0xF6) == 0xF0: + return ".aac" return ".mp3" if data[:2] == b"PK": return ".zip" @@ -121,6 +137,61 @@ def _ext_to_mime(ext: str) -> str: return _EXT_TO_MIME.get(ext.lower(), "application/octet-stream") +def _remux_aac_to_m4a(aac_data: bytes) -> Optional[Tuple[bytes, str]]: + """Losslessly remux raw ADTS AAC bytes into an MP4 (.m4a) container. + + Used by the Signal attachment cache so Android voice notes land on disk + in a container that every major STT API (Groq, OpenAI, xAI, Mistral + Voxtral) will accept. ``ffmpeg -c:a copy`` is a single demux/remux — + no re-encode, no quality loss, sub-100ms for typical voice-note sizes. + + Returns ``(m4a_bytes, ".m4a")`` on success, or ``None`` if ffmpeg is + missing, input is invalid, or remux fails for any reason. Callers + must treat ``None`` as "pass through unchanged" and not raise. + """ + ffmpeg = shutil.which("ffmpeg") + if not ffmpeg: + # Common Homebrew/local prefixes on macOS dev hosts. + for prefix in ("/opt/homebrew/bin/ffmpeg", "/usr/local/bin/ffmpeg"): + if os.path.isfile(prefix) and os.access(prefix, os.X_OK): + ffmpeg = prefix + break + if not ffmpeg: + logger.debug("Signal: ffmpeg not found, skipping AAC→M4A remux") + return None + try: + with tempfile.NamedTemporaryFile(suffix=".aac", delete=False) as src: + src.write(aac_data) + src_path = src.name + dst_path = src_path[:-4] + ".m4a" + try: + proc = subprocess.run( + [ffmpeg, "-y", "-loglevel", "error", "-i", src_path, + "-c:a", "copy", "-movflags", "+faststart", dst_path], + capture_output=True, timeout=10, + ) + if proc.returncode != 0: + logger.warning( + "Signal: AAC→M4A remux failed (ffmpeg exit %d): %s", + proc.returncode, proc.stderr.decode("utf-8", "replace")[:300], + ) + return None + with open(dst_path, "rb") as f: + return f.read(), ".m4a" + finally: + for p in (src_path, dst_path): + try: + os.unlink(p) + except OSError: + pass + except subprocess.TimeoutExpired: + logger.warning("Signal: AAC→M4A remux timed out (>10s)") + return None + except Exception: + logger.exception("Signal: AAC→M4A remux error") + return None + + def _render_mentions(text: str, mentions: list) -> str: """Replace Signal mention placeholders (\\uFFFC) with readable @identifiers. @@ -725,6 +796,17 @@ class SignalAdapter(BasePlatformAdapter): raw_data = base64.b64decode(result) ext = _guess_extension(raw_data) + # Android Signal voice notes are raw ADTS AAC streams. Most STT + # providers (Groq Whisper, OpenAI Whisper) reject raw ADTS — they + # require AAC to be muxed into an MP4 container. Remux losslessly + # with ``ffmpeg -c:a copy`` so the cached file is a normal .m4a. + # No re-encode, sub-100ms on a Pi 5. Graceful no-op if ffmpeg is + # absent; the STT layer has its own sniff-and-remux fallback. + if ext == ".aac": + remuxed: Optional[Tuple[bytes, str]] = await asyncio.to_thread(_remux_aac_to_m4a, raw_data) + if remuxed is not None: + raw_data, ext = remuxed + if _is_image_ext(ext): path = cache_image_from_bytes(raw_data, ext) elif _is_audio_ext(ext): diff --git a/tests/gateway/test_signal.py b/tests/gateway/test_signal.py index b95a16d5409..b55c4215ecb 100644 --- a/tests/gateway/test_signal.py +++ b/tests/gateway/test_signal.py @@ -163,6 +163,78 @@ class TestSignalHelpers: from gateway.platforms.signal import _guess_extension assert _guess_extension(b"\x00\x00\x00\x18ftypisom" + b"\x00" * 100) == ".mp4" + def test_guess_extension_aac_adts_unprotected(self): + """ADTS AAC, MPEG-4, no CRC (the canonical Android Signal voice note). + + Byte 0 = 0xFF (sync high), byte 1 = 0xF1 (sync low + ID=0 + layer=00 + + protection_absent=1). Must NOT be misclassified as MP3 — the old + code's ``(b[1] & 0xE0) == 0xE0`` test wrongly returned ``.mp3``. + """ + from gateway.platforms.signal import _guess_extension + assert _guess_extension(b"\xff\xf1" + b"\x00" * 200) == ".aac" + + def test_guess_extension_aac_adts_protected(self): + """ADTS AAC, MPEG-4, CRC present (protection_absent=0).""" + from gateway.platforms.signal import _guess_extension + assert _guess_extension(b"\xff\xf0" + b"\x00" * 200) == ".aac" + + def test_guess_extension_mp3_mpeg1_layer3(self): + """Real MP3 frame, MPEG-1 Layer 3: byte1 = 0xFB (ID=1, layer=01, prot=1).""" + from gateway.platforms.signal import _guess_extension + assert _guess_extension(b"\xff\xfb" + b"\x00" * 200) == ".mp3" + + def test_guess_extension_mp3_mpeg2_layer3(self): + """Real MP3 frame, MPEG-2 Layer 3: byte1 = 0xF3 (ID=1, layer=01, prot=1).""" + from gateway.platforms.signal import _guess_extension + assert _guess_extension(b"\xff\xf3" + b"\x00" * 200) == ".mp3" + + def test_guess_extension_aac_routes_to_audio_cache(self): + """ADTS-detected files must be routed to the audio cache, not document. + + ``_is_audio_ext(``.aac``)`` is True, so a Signal attachment that + begins with the ADTS sync word ends up in ``cache_audio_from_bytes``, + which the remux step then converts to MP4 container. + """ + from gateway.platforms.signal import _is_audio_ext, _guess_extension + ext = _guess_extension(b"\xff\xf1" + b"\x00" * 200) + assert ext == ".aac" + assert _is_audio_ext(ext) is True + + def test_remux_aac_to_m4a_round_trip(self): + """Real ADTS file from the audio cache remuxes to a valid MP4 container. + + Round-trips the actual Android voice note that triggered the + bug report — proves the end-to-end fix. + """ + import os + import shutil + from gateway.platforms.signal import _remux_aac_to_m4a + src = "/home/pi/.hermes/audio_cache/audio_fcfc38390b47.mp3" + if not os.path.exists(src) or not shutil.which("ffmpeg"): + import pytest + pytest.skip("ffmpeg or source file not available in this env") + with open(src, "rb") as f: + aac_data = f.read() + result = _remux_aac_to_m4a(aac_data) + assert result is not None + m4a_bytes, ext = result + assert ext == ".m4a" + # MP4 files start with a 4-byte size, then ``ftyp`` at offset 4. + assert m4a_bytes[4:8] == b"ftyp", \ + f"expected MP4 ftyp box, got {m4a_bytes[:12]!r}" + # File must be at least as long as the input (MP4 has overhead). + assert len(m4a_bytes) >= len(aac_data) * 0.5 + + def test_remux_aac_to_m4a_handles_garbage(self): + """Garbage input should return None, not raise.""" + from gateway.platforms.signal import _remux_aac_to_m4a + result = _remux_aac_to_m4a(b"\xff\xf1garbage_no_aac_frames") + # Either returns None (ffmpeg errored) or a real M4A. If it returned + # bytes, the bytes must look like an MP4. Otherwise it returns None. + if result is not None: + m4a_bytes, ext = result + assert ext == ".m4a" + def test_guess_extension_unknown(self): from gateway.platforms.signal import _guess_extension assert _guess_extension(b"\x00\x01\x02\x03" * 10) == ".bin"