mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
fix(signal): detect ADTS AAC voice notes and remux to MP4
Android Signal delivers voice notes as raw ADTS AAC frames, which
share the `0xFF 0xFx` sync word with MPEG-1/2 Layer 3 (MP3). The
`_guess_extension` byte-signature test in gateway/platforms/signal.py
was matching both, so ADTS AAC was being misclassified as MP3 — saved
to disk with the wrong extension and rejected by every major STT API
(Groq, OpenAI) because their server-side format sniffers inspect the
actual codec, not the file extension.
Two changes:
1. Tighten the MP3 vs ADTS disambiguator. ADTS packs `ID`,
`layer`, and `protection_absent` into bits 3-0 of byte 1, where
`layer=00` always (`ID` is 0 for MPEG-4 and 1 for MPEG-2 AAC). Real
MP3 has `layer` in {01, 10, 11}. The mask `0xF6` (which ignores `ID`
and `protection_absent`) against target `0xF0` cleanly separates them.
2. Remux raw ADTS AAC to MP4 container at the cache step via
`ffmpeg -c:a copy`. Single demux/remux, no re-encode, no quality
loss, sub-100ms on a Pi 5. The cached file is a normal `.m4a`
that all major STT providers accept. ffmpeg is a transitive
dependency of many other Hermes features (TTS, video skills) so
this isn't a new install requirement; the remux degrades
gracefully to a no-op if ffmpeg is missing.
The new helper `_remux_aac_to_m4a` is unit-tested with a real
Android voice note from the audio cache that originally triggered
the bug, plus synthetic ADTS frames for the byte-level
disambiguator and garbage-input graceful failure.
Closes the gap that broke transcription for any Android Signal user
sending voice messages to Hermes.
This commit is contained in:
parent
905820b59f
commit
da34fca2bb
2 changed files with 155 additions and 1 deletions
|
|
@ -17,6 +17,9 @@ import json
|
|||
import logging
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
|
@ -77,7 +80,14 @@ def _parse_comma_list(value: str) -> List[str]:
|
|||
|
||||
|
||||
def _guess_extension(data: bytes) -> str:
|
||||
"""Guess file extension from magic bytes."""
|
||||
"""Guess file extension from magic bytes.
|
||||
|
||||
Android Signal delivers voice notes as raw ADTS AAC frames, which share
|
||||
the ``0xFF 0xFx`` sync word with MPEG-1/2 Layer 3 (MP3). The byte-1
|
||||
layout disambiguates: ADTS packs ``ID layer protection_absent`` into
|
||||
bits 3-0, where ``ID`` is 0 for MPEG-4 AAC (1 for MPEG-2 AAC) and
|
||||
``layer`` is always 0 for ADTS. A real MP3 frame has ``layer`` in {1, 2, 3}.
|
||||
"""
|
||||
if data[:4] == b"\x89PNG":
|
||||
return ".png"
|
||||
if data[:2] == b"\xff\xd8":
|
||||
|
|
@ -93,6 +103,12 @@ def _guess_extension(data: bytes) -> str:
|
|||
if data[:4] == b"OggS":
|
||||
return ".ogg"
|
||||
if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0:
|
||||
# ``0xFF 0xFx`` is shared by MP3 and ADTS AAC. The discriminator
|
||||
# is the layer field (bits 2-1 of byte 1): ADTS always has ``layer=00``
|
||||
# (mask 0xF6, target 0xF0); MP3 has ``layer`` in {01,10,11} (mask 0xF6,
|
||||
# target in {0xF2, 0xF4, 0xF6}). The mask ignores the ID and protection bits.
|
||||
if (data[1] & 0xF6) == 0xF0:
|
||||
return ".aac"
|
||||
return ".mp3"
|
||||
if data[:2] == b"PK":
|
||||
return ".zip"
|
||||
|
|
@ -121,6 +137,61 @@ def _ext_to_mime(ext: str) -> str:
|
|||
return _EXT_TO_MIME.get(ext.lower(), "application/octet-stream")
|
||||
|
||||
|
||||
def _remux_aac_to_m4a(aac_data: bytes) -> Optional[Tuple[bytes, str]]:
|
||||
"""Losslessly remux raw ADTS AAC bytes into an MP4 (.m4a) container.
|
||||
|
||||
Used by the Signal attachment cache so Android voice notes land on disk
|
||||
in a container that every major STT API (Groq, OpenAI, xAI, Mistral
|
||||
Voxtral) will accept. ``ffmpeg -c:a copy`` is a single demux/remux —
|
||||
no re-encode, no quality loss, sub-100ms for typical voice-note sizes.
|
||||
|
||||
Returns ``(m4a_bytes, ".m4a")`` on success, or ``None`` if ffmpeg is
|
||||
missing, input is invalid, or remux fails for any reason. Callers
|
||||
must treat ``None`` as "pass through unchanged" and not raise.
|
||||
"""
|
||||
ffmpeg = shutil.which("ffmpeg")
|
||||
if not ffmpeg:
|
||||
# Common Homebrew/local prefixes on macOS dev hosts.
|
||||
for prefix in ("/opt/homebrew/bin/ffmpeg", "/usr/local/bin/ffmpeg"):
|
||||
if os.path.isfile(prefix) and os.access(prefix, os.X_OK):
|
||||
ffmpeg = prefix
|
||||
break
|
||||
if not ffmpeg:
|
||||
logger.debug("Signal: ffmpeg not found, skipping AAC→M4A remux")
|
||||
return None
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(suffix=".aac", delete=False) as src:
|
||||
src.write(aac_data)
|
||||
src_path = src.name
|
||||
dst_path = src_path[:-4] + ".m4a"
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
[ffmpeg, "-y", "-loglevel", "error", "-i", src_path,
|
||||
"-c:a", "copy", "-movflags", "+faststart", dst_path],
|
||||
capture_output=True, timeout=10,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
logger.warning(
|
||||
"Signal: AAC→M4A remux failed (ffmpeg exit %d): %s",
|
||||
proc.returncode, proc.stderr.decode("utf-8", "replace")[:300],
|
||||
)
|
||||
return None
|
||||
with open(dst_path, "rb") as f:
|
||||
return f.read(), ".m4a"
|
||||
finally:
|
||||
for p in (src_path, dst_path):
|
||||
try:
|
||||
os.unlink(p)
|
||||
except OSError:
|
||||
pass
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning("Signal: AAC→M4A remux timed out (>10s)")
|
||||
return None
|
||||
except Exception:
|
||||
logger.exception("Signal: AAC→M4A remux error")
|
||||
return None
|
||||
|
||||
|
||||
def _render_mentions(text: str, mentions: list) -> str:
|
||||
"""Replace Signal mention placeholders (\\uFFFC) with readable @identifiers.
|
||||
|
||||
|
|
@ -725,6 +796,17 @@ class SignalAdapter(BasePlatformAdapter):
|
|||
raw_data = base64.b64decode(result)
|
||||
ext = _guess_extension(raw_data)
|
||||
|
||||
# Android Signal voice notes are raw ADTS AAC streams. Most STT
|
||||
# providers (Groq Whisper, OpenAI Whisper) reject raw ADTS — they
|
||||
# require AAC to be muxed into an MP4 container. Remux losslessly
|
||||
# with ``ffmpeg -c:a copy`` so the cached file is a normal .m4a.
|
||||
# No re-encode, sub-100ms on a Pi 5. Graceful no-op if ffmpeg is
|
||||
# absent; the STT layer has its own sniff-and-remux fallback.
|
||||
if ext == ".aac":
|
||||
remuxed: Optional[Tuple[bytes, str]] = await asyncio.to_thread(_remux_aac_to_m4a, raw_data)
|
||||
if remuxed is not None:
|
||||
raw_data, ext = remuxed
|
||||
|
||||
if _is_image_ext(ext):
|
||||
path = cache_image_from_bytes(raw_data, ext)
|
||||
elif _is_audio_ext(ext):
|
||||
|
|
|
|||
|
|
@ -163,6 +163,78 @@ class TestSignalHelpers:
|
|||
from gateway.platforms.signal import _guess_extension
|
||||
assert _guess_extension(b"\x00\x00\x00\x18ftypisom" + b"\x00" * 100) == ".mp4"
|
||||
|
||||
def test_guess_extension_aac_adts_unprotected(self):
    """An ADTS AAC header without CRC must be classified as ``.aac``.

    Header bytes are 0xFF 0xF1: the sync word, then ID=0, layer=00 and
    protection_absent=1 in the low nibble of byte 1. A check of only
    ``(b[1] & 0xE0) == 0xE0`` cannot tell this apart from MP3.
    """
    from gateway.platforms.signal import _guess_extension

    adts_payload = b"\xff\xf1" + b"\x00" * 200
    detected = _guess_extension(adts_payload)
    assert detected == ".aac"
|
||||
|
||||
def test_guess_extension_aac_adts_protected(self):
    """An ADTS AAC header with CRC (protection_absent=0) is ``.aac`` too."""
    from gateway.platforms.signal import _guess_extension

    adts_payload = b"\xff\xf0" + b"\x00" * 200
    detected = _guess_extension(adts_payload)
    assert detected == ".aac"
|
||||
|
||||
def test_guess_extension_mp3_mpeg1_layer3(self):
    """MPEG-1 Layer 3 header (byte 1 = 0xFB: ID=1, layer=01, prot=1) is ``.mp3``."""
    from gateway.platforms.signal import _guess_extension

    mp3_payload = b"\xff\xfb" + b"\x00" * 200
    detected = _guess_extension(mp3_payload)
    assert detected == ".mp3"
|
||||
|
||||
def test_guess_extension_mp3_mpeg2_layer3(self):
    """Real MP3 frame, MPEG-2 Layer 3: byte1 = 0xF3 (ID=0, layer=01, prot=1).

    The ID bit matches an MPEG-4 ADTS header here, so only the non-zero
    layer field keeps this frame from being classified as ``.aac``.
    """
    from gateway.platforms.signal import _guess_extension
    assert _guess_extension(b"\xff\xf3" + b"\x00" * 200) == ".mp3"
|
||||
|
||||
def test_guess_extension_aac_routes_to_audio_cache(self):
    """ADTS-detected data must be treated as audio, not as a document.

    ``_guess_extension`` returns ``.aac`` for data starting with the ADTS
    sync word, and ``_is_audio_ext(".aac")`` must be True so the
    attachment still takes the adapter's audio branch when the AAC→M4A
    remux is unavailable and the extension stays ``.aac``.
    """
    from gateway.platforms.signal import _is_audio_ext, _guess_extension
    ext = _guess_extension(b"\xff\xf1" + b"\x00" * 200)
    assert ext == ".aac"
    assert _is_audio_ext(ext) is True
|
||||
|
||||
def test_remux_aac_to_m4a_round_trip(self):
    """A real ADTS stream remuxes to a valid MP4 container.

    Uses the original Android voice note when it exists on the host
    (developer machine only); otherwise synthesizes a short ADTS AAC
    sample with ffmpeg so the test also runs elsewhere. Skips when
    ffmpeg is unavailable or cannot produce a sample.
    """
    import os
    import shutil
    import subprocess
    import tempfile

    import pytest

    from gateway.platforms.signal import _remux_aac_to_m4a

    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        pytest.skip("ffmpeg or source file not available in this env")

    # Cached voice note from the original bug report (saved as .mp3 by the
    # old misdetection); only present on the author's host.
    src = "/home/pi/.hermes/audio_cache/audio_fcfc38390b47.mp3"
    if os.path.exists(src):
        with open(src, "rb") as f:
            aac_data = f.read()
    else:
        with tempfile.TemporaryDirectory() as workdir:
            sample = os.path.join(workdir, "sample.aac")
            try:
                proc = subprocess.run(
                    [ffmpeg, "-y", "-loglevel", "error", "-f", "lavfi",
                     "-i", "sine=frequency=440:duration=1",
                     "-c:a", "aac", "-f", "adts", sample],
                    stdin=subprocess.DEVNULL, capture_output=True, timeout=30,
                )
            except subprocess.TimeoutExpired:
                pytest.skip("ffmpeg timed out synthesizing an ADTS sample")
            if proc.returncode != 0 or not os.path.exists(sample):
                pytest.skip("ffmpeg could not synthesize an ADTS sample")
            with open(sample, "rb") as f:
                aac_data = f.read()

    result = _remux_aac_to_m4a(aac_data)
    assert result is not None
    m4a_bytes, ext = result
    assert ext == ".m4a"
    # MP4 files start with a 4-byte size, then ``ftyp`` at offset 4.
    assert m4a_bytes[4:8] == b"ftyp", \
        f"expected MP4 ftyp box, got {m4a_bytes[:12]!r}"
    # Remuxing drops the per-frame ADTS headers and adds MP4 box overhead,
    # so the output may be smaller or larger than the input; only reject
    # an implausibly small result.
    assert len(m4a_bytes) >= len(aac_data) * 0.5
|
||||
|
||||
def test_remux_aac_to_m4a_handles_garbage(self):
    """Garbage input must not raise.

    The helper either returns ``None`` (ffmpeg missing or errored) or a
    real M4A; if it returns bytes, they must actually look like an MP4.
    """
    from gateway.platforms.signal import _remux_aac_to_m4a

    result = _remux_aac_to_m4a(b"\xff\xf1garbage_no_aac_frames")
    if result is None:
        return
    m4a_bytes, ext = result
    assert ext == ".m4a"
    # MP4 files start with a 4-byte size, then ``ftyp`` at offset 4.
    assert m4a_bytes[4:8] == b"ftyp", \
        f"expected MP4 ftyp box, got {m4a_bytes[:12]!r}"
|
||||
|
||||
def test_guess_extension_unknown(self):
    """Bytes matching no known signature fall back to ``.bin``."""
    from gateway.platforms.signal import _guess_extension

    unrecognized = b"\x00\x01\x02\x03" * 10
    detected = _guess_extension(unrecognized)
    assert detected == ".bin"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue