mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(discord): strip RTP padding before DAVE/Opus decode (#11267)
The Discord voice receive path skipped RFC 3550 §5.1 padding handling, passing padding-contaminated payloads into DAVE E2EE decrypt and Opus decode. Symptoms in live VC sessions: deaf inbound speech, intermittent empty STT results, "corrupted stream" decode errors — especially on the first reply after join. When the P bit is set in the RTP header, the last payload byte holds the count of trailing padding bytes (including itself) that must be removed. Receive pipeline now follows the spec order: 1. RTP header parse 2. NaCl transport decrypt (aead_xchacha20_poly1305_rtpsize) 3. strip encrypted RTP extension data from start 4. strip RTP padding from end if P bit set ← was missing 5. DAVE inner media decrypt 6. Opus decode Drops malformed packets where pad_len is 0 or exceeds payload length. Adds 7 integration tests covering valid padded packets, the X+P combined case, padding under DAVE passthrough, and three malformed-padding paths. Closes #11267 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
6ba4bb6b8e
commit
c1c9ab534c
2 changed files with 177 additions and 0 deletions
|
|
@ -235,6 +235,7 @@ class VoiceReceiver:
|
||||||
# Calculate dynamic RTP header size (RFC 9335 / rtpsize mode)
|
# Calculate dynamic RTP header size (RFC 9335 / rtpsize mode)
|
||||||
cc = first_byte & 0x0F # CSRC count
|
cc = first_byte & 0x0F # CSRC count
|
||||||
has_extension = bool(first_byte & 0x10) # extension bit
|
has_extension = bool(first_byte & 0x10) # extension bit
|
||||||
|
has_padding = bool(first_byte & 0x20) # padding bit (RFC 3550 §5.1)
|
||||||
header_size = 12 + (4 * cc) + (4 if has_extension else 0)
|
header_size = 12 + (4 * cc) + (4 if has_extension else 0)
|
||||||
|
|
||||||
if len(data) < header_size + 4: # need at least header + nonce
|
if len(data) < header_size + 4: # need at least header + nonce
|
||||||
|
|
@ -278,6 +279,31 @@ class VoiceReceiver:
|
||||||
if ext_data_len and len(decrypted) > ext_data_len:
|
if ext_data_len and len(decrypted) > ext_data_len:
|
||||||
decrypted = decrypted[ext_data_len:]
|
decrypted = decrypted[ext_data_len:]
|
||||||
|
|
||||||
|
# --- Strip RTP padding (RFC 3550 §5.1) ---
|
||||||
|
# When the P bit is set, the last payload byte holds the count of
|
||||||
|
# trailing padding bytes (including itself) that must be removed
|
||||||
|
# before further processing. Skipping this passes padding-contaminated
|
||||||
|
# bytes into DAVE/Opus and corrupts inbound audio.
|
||||||
|
if has_padding:
|
||||||
|
if not decrypted:
|
||||||
|
if self._packet_debug_count <= 10:
|
||||||
|
logger.warning(
|
||||||
|
"RTP padding bit set but no payload (ssrc=%d)", ssrc,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
pad_len = decrypted[-1]
|
||||||
|
if pad_len == 0 or pad_len > len(decrypted):
|
||||||
|
if self._packet_debug_count <= 10:
|
||||||
|
logger.warning(
|
||||||
|
"Invalid RTP padding length %d for payload size %d (ssrc=%d)",
|
||||||
|
pad_len, len(decrypted), ssrc,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
decrypted = decrypted[:-pad_len]
|
||||||
|
if not decrypted:
|
||||||
|
# Padding consumed entire payload — nothing to decode
|
||||||
|
return
|
||||||
|
|
||||||
# --- DAVE E2EE decrypt ---
|
# --- DAVE E2EE decrypt ---
|
||||||
if self._dave_session:
|
if self._dave_session:
|
||||||
with self._lock:
|
with self._lock:
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,50 @@ def _build_encrypted_rtp_packet(secret_key, opus_payload, ssrc=100, seq=1, times
|
||||||
return header + ciphertext + nonce_counter
|
return header + ciphertext + nonce_counter
|
||||||
|
|
||||||
|
|
||||||
|
def _build_padded_rtp_packet(
|
||||||
|
secret_key, opus_payload, pad_len, ssrc=100, seq=1, timestamp=960,
|
||||||
|
declared_pad_len=None, ext_words=0,
|
||||||
|
):
|
||||||
|
"""Build a NaCl-encrypted RTP packet with the P bit set and padding appended.
|
||||||
|
|
||||||
|
Per RFC 3550 §5.1, the last padding byte declares how many trailing bytes
|
||||||
|
(including itself) to discard. ``pad_len`` is the actual padding appended;
|
||||||
|
``declared_pad_len`` lets a test forge a mismatched declared length to
|
||||||
|
exercise the validation path. ``ext_words`` > 0 also sets the X bit and
|
||||||
|
prepends a synthetic extension block (4-byte preamble in cleartext header,
|
||||||
|
ext_words*4 bytes of encrypted extension data prepended to the payload).
|
||||||
|
"""
|
||||||
|
if pad_len < 1:
|
||||||
|
raise ValueError("pad_len must be >= 1 (last byte includes itself)")
|
||||||
|
declared = pad_len if declared_pad_len is None else declared_pad_len
|
||||||
|
if declared < 0 or declared > 255:
|
||||||
|
raise ValueError("declared_pad_len must fit in one byte")
|
||||||
|
|
||||||
|
has_extension = ext_words > 0
|
||||||
|
first_byte = 0xA0 | (0x10 if has_extension else 0) # V=2, P=1, [X=?], CC=0
|
||||||
|
fixed_header = struct.pack(">BBHII", first_byte, 0x78, seq, timestamp, ssrc)
|
||||||
|
if has_extension:
|
||||||
|
# 4-byte extension preamble: 2 bytes "defined by profile" + 2 bytes length-in-words
|
||||||
|
ext_preamble = struct.pack(">HH", 0xBEDE, ext_words)
|
||||||
|
header = fixed_header + ext_preamble
|
||||||
|
ext_data = b"\xab" * (ext_words * 4)
|
||||||
|
else:
|
||||||
|
header = fixed_header
|
||||||
|
ext_data = b""
|
||||||
|
|
||||||
|
padding = b"\x00" * (pad_len - 1) + bytes([declared])
|
||||||
|
plaintext = ext_data + opus_payload + padding
|
||||||
|
|
||||||
|
box = nacl.secret.Aead(secret_key)
|
||||||
|
nonce_counter = struct.pack(">I", seq)
|
||||||
|
full_nonce = nonce_counter + b"\x00" * 20
|
||||||
|
|
||||||
|
enc_msg = box.encrypt(plaintext, header, full_nonce)
|
||||||
|
ciphertext = enc_msg.ciphertext
|
||||||
|
|
||||||
|
return header + ciphertext + nonce_counter
|
||||||
|
|
||||||
|
|
||||||
def _make_voice_receiver(secret_key, dave_session=None, bot_ssrc=9999,
|
def _make_voice_receiver(secret_key, dave_session=None, bot_ssrc=9999,
|
||||||
allowed_user_ids=None, members=None):
|
allowed_user_ids=None, members=None):
|
||||||
"""Create a VoiceReceiver with real secret key."""
|
"""Create a VoiceReceiver with real secret key."""
|
||||||
|
|
@ -212,6 +256,113 @@ class TestRealNaClWithDAVE:
|
||||||
assert len(receiver._buffers.get(100, b"")) == 0
|
assert len(receiver._buffers.get(100, b"")) == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestRTPPaddingStrip:
|
||||||
|
"""RFC 3550 §5.1 — strip RTP padding before DAVE/Opus decode."""
|
||||||
|
|
||||||
|
def test_padded_packet_stripped_and_buffered(self):
|
||||||
|
"""P bit set → trailing padding stripped → opus payload decoded."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
receiver = _make_voice_receiver(key)
|
||||||
|
|
||||||
|
# 5 bytes of padding (4 zeros + count byte = 5)
|
||||||
|
packet = _build_padded_rtp_packet(key, opus_silence, pad_len=5, ssrc=100)
|
||||||
|
receiver._on_packet(packet)
|
||||||
|
|
||||||
|
assert 100 in receiver._buffers
|
||||||
|
assert len(receiver._buffers[100]) > 0
|
||||||
|
|
||||||
|
def test_padded_packet_matches_unpadded_output(self):
|
||||||
|
"""Same opus payload with/without padding → same decoded PCM."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
|
||||||
|
recv_plain = _make_voice_receiver(key)
|
||||||
|
recv_plain._on_packet(
|
||||||
|
_build_encrypted_rtp_packet(key, opus_silence, ssrc=100)
|
||||||
|
)
|
||||||
|
|
||||||
|
recv_padded = _make_voice_receiver(key)
|
||||||
|
recv_padded._on_packet(
|
||||||
|
_build_padded_rtp_packet(key, opus_silence, pad_len=7, ssrc=100)
|
||||||
|
)
|
||||||
|
|
||||||
|
assert bytes(recv_plain._buffers[100]) == bytes(recv_padded._buffers[100])
|
||||||
|
|
||||||
|
def test_padding_with_dave_passthrough(self):
|
||||||
|
"""Padding stripped before DAVE → passthrough buffers cleanly."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
dave = MagicMock() # SSRC unmapped → DAVE skipped, passthrough used
|
||||||
|
receiver = _make_voice_receiver(key, dave_session=dave)
|
||||||
|
|
||||||
|
packet = _build_padded_rtp_packet(key, opus_silence, pad_len=4, ssrc=100)
|
||||||
|
receiver._on_packet(packet)
|
||||||
|
|
||||||
|
dave.decrypt.assert_not_called()
|
||||||
|
assert 100 in receiver._buffers
|
||||||
|
assert len(receiver._buffers[100]) > 0
|
||||||
|
|
||||||
|
def test_invalid_padding_length_zero_dropped(self):
|
||||||
|
"""Declared pad_len=0 is invalid (RFC requires count includes itself)."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
receiver = _make_voice_receiver(key)
|
||||||
|
|
||||||
|
packet = _build_padded_rtp_packet(
|
||||||
|
key, opus_silence, pad_len=4, declared_pad_len=0, ssrc=100
|
||||||
|
)
|
||||||
|
receiver._on_packet(packet)
|
||||||
|
|
||||||
|
assert len(receiver._buffers.get(100, b"")) == 0
|
||||||
|
|
||||||
|
def test_invalid_padding_length_overflow_dropped(self):
|
||||||
|
"""Declared pad_len > payload size → packet dropped."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
receiver = _make_voice_receiver(key)
|
||||||
|
|
||||||
|
packet = _build_padded_rtp_packet(
|
||||||
|
key, opus_silence, pad_len=4, declared_pad_len=255, ssrc=100
|
||||||
|
)
|
||||||
|
receiver._on_packet(packet)
|
||||||
|
|
||||||
|
assert len(receiver._buffers.get(100, b"")) == 0
|
||||||
|
|
||||||
|
def test_padding_consuming_entire_payload_dropped(self):
|
||||||
|
"""Padding consumes entire payload → no opus data → dropped."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
receiver = _make_voice_receiver(key)
|
||||||
|
|
||||||
|
# Empty opus payload, 6 bytes of padding (count byte declares 6)
|
||||||
|
packet = _build_padded_rtp_packet(key, b"", pad_len=6, ssrc=100)
|
||||||
|
receiver._on_packet(packet)
|
||||||
|
|
||||||
|
assert len(receiver._buffers.get(100, b"")) == 0
|
||||||
|
|
||||||
|
def test_padding_with_extension_stripped_correctly(self):
|
||||||
|
"""X+P bits both set → strip extension from start, padding from end."""
|
||||||
|
key = _make_secret_key()
|
||||||
|
opus_silence = b"\xf8\xff\xfe"
|
||||||
|
|
||||||
|
# Same opus payload sent two ways: plain, and with both ext+padding
|
||||||
|
recv_plain = _make_voice_receiver(key)
|
||||||
|
recv_plain._on_packet(
|
||||||
|
_build_encrypted_rtp_packet(key, opus_silence, ssrc=100)
|
||||||
|
)
|
||||||
|
|
||||||
|
recv_ext_pad = _make_voice_receiver(key)
|
||||||
|
recv_ext_pad._on_packet(
|
||||||
|
_build_padded_rtp_packet(
|
||||||
|
key, opus_silence, pad_len=5, ext_words=2, ssrc=100
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Both must yield identical decoded PCM — ext data and padding both
|
||||||
|
# stripped before opus decode.
|
||||||
|
assert bytes(recv_plain._buffers[100]) == bytes(recv_ext_pad._buffers[100])
|
||||||
|
|
||||||
|
|
||||||
class TestFullVoiceFlow:
|
class TestFullVoiceFlow:
|
||||||
"""End-to-end: encrypt → receive → buffer → silence detect → complete."""
|
"""End-to-end: encrypt → receive → buffer → silence detect → complete."""
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue