fix(discord): strip RTP padding before DAVE/Opus decode (#11267)

The Discord voice receive path skipped RFC 3550 §5.1 padding handling,
passing padding-contaminated payloads into DAVE E2EE decrypt and Opus
decode. Symptoms in live VC sessions: deaf inbound speech, intermittent
empty STT results, "corrupted stream" decode errors — especially on the
first reply after join.

When the P bit is set in the RTP header, the last payload byte holds the
count of trailing padding bytes (including itself) that must be removed.
Receive pipeline now follows the spec order:

  1. RTP header parse
  2. NaCl transport decrypt (aead_xchacha20_poly1305_rtpsize)
  3. strip encrypted RTP extension data from start
  4. strip RTP padding from end if P bit set  ← was missing
  5. DAVE inner media decrypt
  6. Opus decode

Drops malformed packets where pad_len is 0 or exceeds payload length.

Adds 7 integration tests covering valid padded packets, the X+P combined
case, padding under DAVE passthrough, and three malformed-padding paths.

Closes #11267

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Michel Belleau 2026-04-16 19:30:28 -04:00 committed by Teknium
parent 6ba4bb6b8e
commit c1c9ab534c
2 changed files with 177 additions and 0 deletions

View file

@ -235,6 +235,7 @@ class VoiceReceiver:
# Calculate dynamic RTP header size (RFC 9335 / rtpsize mode) # Calculate dynamic RTP header size (RFC 9335 / rtpsize mode)
cc = first_byte & 0x0F # CSRC count cc = first_byte & 0x0F # CSRC count
has_extension = bool(first_byte & 0x10) # extension bit has_extension = bool(first_byte & 0x10) # extension bit
has_padding = bool(first_byte & 0x20) # padding bit (RFC 3550 §5.1)
header_size = 12 + (4 * cc) + (4 if has_extension else 0) header_size = 12 + (4 * cc) + (4 if has_extension else 0)
if len(data) < header_size + 4: # need at least header + nonce if len(data) < header_size + 4: # need at least header + nonce
@ -278,6 +279,31 @@ class VoiceReceiver:
if ext_data_len and len(decrypted) > ext_data_len: if ext_data_len and len(decrypted) > ext_data_len:
decrypted = decrypted[ext_data_len:] decrypted = decrypted[ext_data_len:]
# --- Strip RTP padding (RFC 3550 §5.1) ---
# When the P bit is set, the last payload byte holds the count of
# trailing padding bytes (including itself) that must be removed
# before further processing. Skipping this passes padding-contaminated
# bytes into DAVE/Opus and corrupts inbound audio.
if has_padding:
if not decrypted:
if self._packet_debug_count <= 10:
logger.warning(
"RTP padding bit set but no payload (ssrc=%d)", ssrc,
)
return
pad_len = decrypted[-1]
if pad_len == 0 or pad_len > len(decrypted):
if self._packet_debug_count <= 10:
logger.warning(
"Invalid RTP padding length %d for payload size %d (ssrc=%d)",
pad_len, len(decrypted), ssrc,
)
return
decrypted = decrypted[:-pad_len]
if not decrypted:
# Padding consumed entire payload — nothing to decode
return
# --- DAVE E2EE decrypt --- # --- DAVE E2EE decrypt ---
if self._dave_session: if self._dave_session:
with self._lock: with self._lock:

View file

@ -73,6 +73,50 @@ def _build_encrypted_rtp_packet(secret_key, opus_payload, ssrc=100, seq=1, times
return header + ciphertext + nonce_counter return header + ciphertext + nonce_counter
def _build_padded_rtp_packet(
secret_key, opus_payload, pad_len, ssrc=100, seq=1, timestamp=960,
declared_pad_len=None, ext_words=0,
):
"""Build a NaCl-encrypted RTP packet with the P bit set and padding appended.
Per RFC 3550 §5.1, the last padding byte declares how many trailing bytes
(including itself) to discard. ``pad_len`` is the actual padding appended;
``declared_pad_len`` lets a test forge a mismatched declared length to
exercise the validation path. ``ext_words`` > 0 also sets the X bit and
prepends a synthetic extension block (4-byte preamble in cleartext header,
ext_words*4 bytes of encrypted extension data prepended to the payload).
"""
if pad_len < 1:
raise ValueError("pad_len must be >= 1 (last byte includes itself)")
declared = pad_len if declared_pad_len is None else declared_pad_len
if declared < 0 or declared > 255:
raise ValueError("declared_pad_len must fit in one byte")
has_extension = ext_words > 0
first_byte = 0xA0 | (0x10 if has_extension else 0) # V=2, P=1, [X=?], CC=0
fixed_header = struct.pack(">BBHII", first_byte, 0x78, seq, timestamp, ssrc)
if has_extension:
# 4-byte extension preamble: 2 bytes "defined by profile" + 2 bytes length-in-words
ext_preamble = struct.pack(">HH", 0xBEDE, ext_words)
header = fixed_header + ext_preamble
ext_data = b"\xab" * (ext_words * 4)
else:
header = fixed_header
ext_data = b""
padding = b"\x00" * (pad_len - 1) + bytes([declared])
plaintext = ext_data + opus_payload + padding
box = nacl.secret.Aead(secret_key)
nonce_counter = struct.pack(">I", seq)
full_nonce = nonce_counter + b"\x00" * 20
enc_msg = box.encrypt(plaintext, header, full_nonce)
ciphertext = enc_msg.ciphertext
return header + ciphertext + nonce_counter
def _make_voice_receiver(secret_key, dave_session=None, bot_ssrc=9999, def _make_voice_receiver(secret_key, dave_session=None, bot_ssrc=9999,
allowed_user_ids=None, members=None): allowed_user_ids=None, members=None):
"""Create a VoiceReceiver with real secret key.""" """Create a VoiceReceiver with real secret key."""
@ -212,6 +256,113 @@ class TestRealNaClWithDAVE:
assert len(receiver._buffers.get(100, b"")) == 0 assert len(receiver._buffers.get(100, b"")) == 0
class TestRTPPaddingStrip:
"""RFC 3550 §5.1 — strip RTP padding before DAVE/Opus decode."""
def test_padded_packet_stripped_and_buffered(self):
"""P bit set → trailing padding stripped → opus payload decoded."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
receiver = _make_voice_receiver(key)
# 5 bytes of padding (4 zeros + count byte = 5)
packet = _build_padded_rtp_packet(key, opus_silence, pad_len=5, ssrc=100)
receiver._on_packet(packet)
assert 100 in receiver._buffers
assert len(receiver._buffers[100]) > 0
def test_padded_packet_matches_unpadded_output(self):
"""Same opus payload with/without padding → same decoded PCM."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
recv_plain = _make_voice_receiver(key)
recv_plain._on_packet(
_build_encrypted_rtp_packet(key, opus_silence, ssrc=100)
)
recv_padded = _make_voice_receiver(key)
recv_padded._on_packet(
_build_padded_rtp_packet(key, opus_silence, pad_len=7, ssrc=100)
)
assert bytes(recv_plain._buffers[100]) == bytes(recv_padded._buffers[100])
def test_padding_with_dave_passthrough(self):
"""Padding stripped before DAVE → passthrough buffers cleanly."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
dave = MagicMock() # SSRC unmapped → DAVE skipped, passthrough used
receiver = _make_voice_receiver(key, dave_session=dave)
packet = _build_padded_rtp_packet(key, opus_silence, pad_len=4, ssrc=100)
receiver._on_packet(packet)
dave.decrypt.assert_not_called()
assert 100 in receiver._buffers
assert len(receiver._buffers[100]) > 0
def test_invalid_padding_length_zero_dropped(self):
"""Declared pad_len=0 is invalid (RFC requires count includes itself)."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
receiver = _make_voice_receiver(key)
packet = _build_padded_rtp_packet(
key, opus_silence, pad_len=4, declared_pad_len=0, ssrc=100
)
receiver._on_packet(packet)
assert len(receiver._buffers.get(100, b"")) == 0
def test_invalid_padding_length_overflow_dropped(self):
"""Declared pad_len > payload size → packet dropped."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
receiver = _make_voice_receiver(key)
packet = _build_padded_rtp_packet(
key, opus_silence, pad_len=4, declared_pad_len=255, ssrc=100
)
receiver._on_packet(packet)
assert len(receiver._buffers.get(100, b"")) == 0
def test_padding_consuming_entire_payload_dropped(self):
"""Padding consumes entire payload → no opus data → dropped."""
key = _make_secret_key()
receiver = _make_voice_receiver(key)
# Empty opus payload, 6 bytes of padding (count byte declares 6)
packet = _build_padded_rtp_packet(key, b"", pad_len=6, ssrc=100)
receiver._on_packet(packet)
assert len(receiver._buffers.get(100, b"")) == 0
def test_padding_with_extension_stripped_correctly(self):
"""X+P bits both set → strip extension from start, padding from end."""
key = _make_secret_key()
opus_silence = b"\xf8\xff\xfe"
# Same opus payload sent two ways: plain, and with both ext+padding
recv_plain = _make_voice_receiver(key)
recv_plain._on_packet(
_build_encrypted_rtp_packet(key, opus_silence, ssrc=100)
)
recv_ext_pad = _make_voice_receiver(key)
recv_ext_pad._on_packet(
_build_padded_rtp_packet(
key, opus_silence, pad_len=5, ext_words=2, ssrc=100
)
)
# Both must yield identical decoded PCM — ext data and padding both
# stripped before opus decode.
assert bytes(recv_plain._buffers[100]) == bytes(recv_ext_pad._buffers[100])
class TestFullVoiceFlow: class TestFullVoiceFlow:
"""End-to-end: encrypt → receive → buffer → silence detect → complete.""" """End-to-end: encrypt → receive → buffer → silence detect → complete."""