From 8a9ded5b21b11355fe08425e2d1c7763acd17842 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 5 Jun 2026 03:10:40 -0700 Subject: [PATCH] =?UTF-8?q?feat(discord):=20voice-channel=20mixer=20?= =?UTF-8?q?=E2=80=94=20ambient=20idle=20bed=20+=20verbal=20acks=20that=20o?= =?UTF-8?q?verlap=20TTS=20(#39659)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(discord): voice-channel mixer — ambient idle bed + verbal acks that overlap TTS Discord voice mode can now feel conversational: the bot speaks a short acknowledgement before it starts working, and a subtle ambient 'thinking' bed plays underneath while tools run, ducking under speech and swelling back — the Grok-voice-mode feel. discord.py plays only one audio stream per voice connection, so this adds a software mixer (VoiceMixer, a discord.AudioSource) installed once per guild on join. It sums an ambient loop, verbal acks, and TTS replies into that single 20ms/48kHz/stereo stream (numpy int16 add + clip), so they overlap instead of stop-and-swap. Speech ducks the ambient gain down and releases it smoothly. - plugins/platforms/discord/voice_mixer.py: VoiceMixer + MixerChild (gain, loop, fade, duck/release), decode_to_pcm (ffmpeg), synth_ambient_pcm (no asset needed — synthesised pad). - adapter: install mixer on join, tear down on leave, route play_in_voice_channel through the mixer (legacy one-shot path kept as fallback), play_ack_in_voice, voice_mixer_active. Defensive getattr for the object.__new__ test helpers. - gateway/run.py: tool_start_callback fires a one-time verbal ack on the first tool call of a turn when in a voice channel (independent of the text tool-progress gate). No system-prompt or message-flow changes. - config: discord.voice_fx.* (OFF by default; ambient/duck/speech gains, ack phrases). All in config.yaml, not .env. - docs + tests (mixer unit + adapter integration). 
Verified: 19 new tests pass, existing voice suite green (2 pre-existing davey-module env failures unchanged), and a real-mixer E2E confirms ambient streams, TTS overlaps it, acks layer in, and teardown is clean. * fix(discord): make voice mixer numpy import lazy (numpy is voice-extra-only) numpy ships in the optional 'voice' extra, not [all,dev], so a module-level 'import numpy' broke CI test collection (and would break the always-imported Discord adapter on any install without the voice extra). Defer numpy to the functions that actually mix audio via _require_numpy(); guard the test module with pytest.importorskip('numpy'). --- gateway/run.py | 46 +++ hermes_cli/config.py | 22 ++ plugins/platforms/discord/adapter.py | 212 ++++++++++- plugins/platforms/discord/voice_mixer.py | 378 +++++++++++++++++++ tests/gateway/test_discord_voice_mixer.py | 264 +++++++++++++ website/docs/user-guide/messaging/discord.md | 31 ++ 6 files changed, 952 insertions(+), 1 deletion(-) create mode 100644 plugins/platforms/discord/voice_mixer.py create mode 100644 tests/gateway/test_discord_voice_mixer.py diff --git a/gateway/run.py b/gateway/run.py index 7887ec23c3a..7fc4da59f5e 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -17024,6 +17024,47 @@ class GatewayRunner: last_progress_msg = [None] # Track last message for dedup repeat_count = [0] # How many times the same message repeated + # ── Discord voice "verbal ack before tool calls" ──────────────── + # When the bot is in a voice channel with the continuous mixer + # installed (discord.voice_fx.enabled), speak a short phrase ("let me + # look into that") over the ambient idle bed on the FIRST tool call of + # the turn. Fires from tool_start_callback (independent of the + # tool-progress text gate), at most once per turn. No-op on every + # other platform / when not in a voice channel. 
+ _voice_ack_fired = [False] + _voice_ack_guild: List[Optional[int]] = [None] + if source.platform == Platform.DISCORD: + _va = self.adapters.get(Platform.DISCORD) + # source.chat_id is the linked text channel; resolve the guild whose + # voice connection is bound to it (mirrors DiscordAdapter.play_tts). + _vtc = getattr(_va, "_voice_text_channels", None) + if isinstance(_vtc, dict) and hasattr(_va, "voice_mixer_active"): + for _gid, _tc in _vtc.items(): + if str(_tc) == str(source.chat_id) and _va.voice_mixer_active(_gid): + _voice_ack_guild[0] = _gid + break + _voice_ack_loop = asyncio.get_running_loop() + + def voice_ack_callback(call_id, tool_name, args): + """tool_start_callback: speak a one-time ack in the voice channel.""" + if _voice_ack_fired[0] or _voice_ack_guild[0] is None: + return + if not _run_still_current(): + return + _voice_ack_fired[0] = True + _adapter = self.adapters.get(Platform.DISCORD) + if _adapter is None or not hasattr(_adapter, "play_ack_in_voice"): + return + try: + safe_schedule_threadsafe( + _adapter.play_ack_in_voice(_voice_ack_guild[0]), + _voice_ack_loop, + logger=logger, + log_message="voice ack scheduling error", + ) + except Exception as _ack_err: + logger.debug("voice ack schedule failed: %s", _ack_err) + # Auto-cleanup of temporary progress bubbles (Telegram + any adapter # that implements ``delete_message``). When enabled via # ``display.platforms..cleanup_progress: true``, message IDs @@ -17834,6 +17875,11 @@ class GatewayRunner: # Per-message state — callbacks and reasoning config change every # turn and must not be baked into the cached agent constructor. agent.tool_progress_callback = progress_callback if tool_progress_enabled else None + # Discord voice verbal-ack hook (fires once per turn on first tool + # call; armed only when in a voice channel with the mixer running). 
+ agent.tool_start_callback = ( + voice_ack_callback if _voice_ack_guild[0] is not None else None + ) agent.step_callback = _step_callback_sync if _hooks_ref.loaded_hooks else None agent.stream_delta_callback = _stream_delta_cb agent.interim_assistant_callback = _interim_assistant_cb if _want_interim_messages else None diff --git a/hermes_cli/config.py b/hermes_cli/config.py index d0b4493ddd5..a37c073c8a4 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1841,6 +1841,28 @@ DEFAULT_CONFIG = { # real memory cost. Default 32 MiB matches the historical hardcoded # cap. Set to 0 for no cap. Env override: DISCORD_MAX_ATTACHMENT_BYTES. "max_attachment_bytes": 33554432, + # Voice-channel audio effects (the continuous mixer). OFF by default. + # When enabled, the bot installs a software mixer on the outgoing voice + # stream so a low ambient "thinking" bed, verbal acknowledgements, and + # TTS replies can OVERLAP (ducking the ambient under speech) instead of + # stop-and-swap — the Grok-voice-mode feel. discord.py ships no mixer; + # this is implemented in plugins/platforms/discord/voice_mixer.py. 
+ "voice_fx": { + "enabled": False, # master switch for the mixer subsystem + "ambient_enabled": True, # play the idle "thinking" bed while tools run + "ambient_path": "", # custom loop audio file; "" = synthesised pad + "ambient_gain": 0.18, # idle bed loudness, 0.0–1.0 + "duck_gain": 0.06, # ambient loudness while speech plays + "speech_gain": 1.0, # TTS / ack loudness, 0.0–1.0 + "ack_enabled": True, # speak a short phrase before the first tool call + "ack_phrases": [ # picked at random; set [] to disable phrases + "Let me look into that.", + "One moment.", + "Checking on that now.", + "Give me a sec.", + "On it.", + ], + }, }, # WhatsApp platform settings (gateway mode) diff --git a/plugins/platforms/discord/adapter.py b/plugins/platforms/discord/adapter.py index 11765176576..fa0f81c9b2e 100644 --- a/plugins/platforms/discord/adapter.py +++ b/plugins/platforms/discord/adapter.py @@ -600,6 +600,12 @@ class DiscordAdapter(BasePlatformAdapter): self._voice_listen_tasks: Dict[int, asyncio.Task] = {} # guild_id -> listen loop self._voice_input_callback: Optional[Callable] = None # set by run.py self._on_voice_disconnect: Optional[Callable] = None # set by run.py + # Phase 3: continuous voice mixer (ambient idle bed + ducked speech). + # Installed once per guild on join; lets acks / TTS / the "thinking" + # loop overlap in one outgoing stream instead of stop-and-swap. + self._voice_mixers: Dict[int, Any] = {} # guild_id -> VoiceMixer + self._ambient_pcm_cache: Optional[bytes] = None # decoded ambient bed + self._voice_fx_cfg: Dict[str, Any] = self._load_voice_fx_config() # Track threads where the bot has participated so follow-up messages # in those threads don't require @mention. Persisted to disk so the # set survives gateway restarts. 
# ----------------------------------------------------------------------
# Voice-mixer methods of ``DiscordAdapter``.
#
# NOTE(review): the enclosing ``class DiscordAdapter`` header lies outside
# this hunk, so the methods are written at column 0 here; they belong in the
# class body under "Voice channel methods (join / leave / play)".
# ----------------------------------------------------------------------

def _load_voice_fx_config(self) -> Dict[str, Any]:
    """Read voice mixer / ambient / ack settings from config.yaml.

    All settings live under ``discord.voice_fx`` in config.yaml (NOT the
    .env file — these are behavioral, not secrets). The feature is OFF by
    default; users opt in with ``discord.voice_fx.enabled: true``.

    Returns:
        A dict that always contains every known key, so callers never
        KeyError. Unknown keys and ``None`` values in the user config are
        ignored; any failure while reading the config yields the defaults.
    """
    settings: Dict[str, Any] = {
        "enabled": False,           # master switch for the mixer subsystem
        "ambient_enabled": True,    # idle "thinking" bed while tools run
        "ambient_path": "",         # optional custom loop file; "" = synthesised
        "ambient_gain": 0.18,       # idle bed loudness (0..1)
        "duck_gain": 0.06,          # ambient loudness while speech plays
        "speech_gain": 1.0,         # TTS / ack loudness
        "ack_enabled": True,        # speak a short phrase before tool calls
        "ack_phrases": [
            "Let me look into that.",
            "One moment.",
            "Checking on that now.",
            "Give me a sec.",
            "On it.",
        ],
    }
    try:
        from hermes_cli.config import read_raw_config
        raw = read_raw_config() or {}
        fx = (raw.get("discord") or {}).get("voice_fx") or {}
        if isinstance(fx, dict):
            settings.update(
                {k: v for k, v in fx.items() if k in settings and v is not None}
            )
    except Exception as e:
        # Best-effort: a broken config must not prevent the adapter starting.
        logger.debug("Could not load discord.voice_fx config: %s", e)
    return settings


def _get_ambient_pcm(self) -> Optional[bytes]:
    """Return decoded 48k/stereo/s16le PCM for the ambient idle bed.

    Uses a custom file when ``ambient_path`` is set and decodable, else a
    synthesised pad. The result is cached on the adapter after the first
    successful build. Returns ``None`` when the ambient bed is disabled
    (and nothing has been cached yet).
    """
    if self._ambient_pcm_cache is not None:
        return self._ambient_pcm_cache
    if not self._voice_fx_cfg.get("ambient_enabled"):
        return None
    try:
        from voice_mixer import decode_to_pcm, synth_ambient_pcm
    except ImportError:
        from .voice_mixer import decode_to_pcm, synth_ambient_pcm

    pcm: Optional[bytes] = None
    path = (self._voice_fx_cfg.get("ambient_path") or "").strip()
    if path:
        if os.path.isfile(path):
            pcm = decode_to_pcm(path)
            if not pcm:
                logger.warning("Ambient file %s failed to decode; using synth bed", path)
        else:
            # Previously a typo in ambient_path was silently ignored.
            logger.warning("Ambient file %s not found; using synth bed", path)
    if not pcm:
        pcm = synth_ambient_pcm()
    self._ambient_pcm_cache = pcm
    return pcm


async def _install_voice_mixer(self, guild_id: int, vc) -> None:
    """Create a VoiceMixer, start the ambient bed, and play it on the VC.

    The mixer runs continuously for the life of the connection: one
    ``vc.play(mixer)`` call, never stopped until leave. Exceptions from
    building the ambient bed or from ``vc.play`` propagate to the caller;
    the mixer is registered only after playback started.
    """
    try:
        from voice_mixer import VoiceMixer
    except ImportError:
        from .voice_mixer import VoiceMixer

    cfg = self._voice_fx_cfg
    mixer = VoiceMixer(
        ambient_gain=float(cfg.get("ambient_gain", 0.18)),
        duck_gain=float(cfg.get("duck_gain", 0.06)),
        speech_gain=float(cfg.get("speech_gain", 1.0)),
    )
    # Decoding / synthesis is CPU + subprocess work: keep it off the loop.
    ambient = await asyncio.to_thread(self._get_ambient_pcm)
    if ambient:
        mixer.set_ambient(ambient)

    def _after(error):
        if error:
            logger.error("Voice mixer stream error (guild=%d): %s", guild_id, error)
        # The outgoing stream has ended, so this mixer will never be read
        # again. Drop it from the registry so later replies use the legacy
        # playback path instead of waiting on a mixer nobody drains.
        if self._voice_mixers.get(guild_id) is mixer:
            self._voice_mixers.pop(guild_id, None)

    if vc.is_playing():
        vc.stop()
    vc.play(mixer, after=_after)
    self._voice_mixers[guild_id] = mixer
    logger.info("Voice mixer installed (guild=%d, ambient=%s)", guild_id, bool(ambient))


async def play_ack_in_voice(self, guild_id: int, phrase: Optional[str] = None) -> bool:
    """Speak a short acknowledgement over the ambient bed.

    Called from the gateway's tool-start hook on the first tool call of a
    turn, so the user hears "let me look into that" before the bot goes
    quiet to work.

    Args:
        guild_id: Guild whose voice connection should play the ack.
        phrase: Text to speak; when ``None`` one of the configured
            ``ack_phrases`` is picked at random.

    Returns:
        True when the ack was synthesised and handed to the mixer. False
        when acks are disabled, no mixer is installed, there is no phrase
        to speak (``ack_phrases: []`` disables phrases), or TTS / decoding
        failed.
    """
    cfg = self._voice_fx_cfg
    if not cfg.get("ack_enabled"):
        return False
    mixer = self._voice_mixers.get(guild_id)
    if mixer is None:
        return False

    if phrase is None:
        import random
        configured = cfg.get("ack_phrases")
        if configured is None:
            # Key absent altogether (e.g. minimal test config): keep a default.
            configured = ["One moment."]
        elif isinstance(configured, str):
            # A bare string would otherwise be sampled character-by-character.
            configured = [configured]
        candidates = [p for p in configured if isinstance(p, str) and p.strip()]
        if not candidates:
            # An explicit empty list means "no spoken phrases".
            return False
        phrase = random.choice(candidates)
    elif not str(phrase).strip():
        return False

    # Synthesise the ack via the configured TTS provider, then layer it.
    import uuid as _uuid
    audio_path = os.path.join(
        tempfile.gettempdir(), "hermes_voice",
        f"ack_{_uuid.uuid4().hex[:12]}.mp3",
    )
    os.makedirs(os.path.dirname(audio_path), exist_ok=True)
    actual: Optional[str] = None  # path the TTS tool really wrote to
    try:
        from tools.tts_tool import text_to_speech_tool
        result_json = await asyncio.to_thread(
            text_to_speech_tool, text=phrase, output_path=audio_path
        )
        result = json.loads(result_json)
        actual = result.get("file_path") or audio_path
        if not result.get("success") or not os.path.isfile(actual):
            return False
        try:
            from voice_mixer import decode_to_pcm
        except ImportError:
            from .voice_mixer import decode_to_pcm
        pcm = await asyncio.to_thread(decode_to_pcm, actual)
        if not pcm:
            return False
        # The bot may have left (or re-joined) while TTS was running; only
        # speak through the mixer that is still installed for this guild.
        if self._voice_mixers.get(guild_id) is not mixer:
            return False
        mixer.play_speech(pcm, gain=float(cfg.get("speech_gain", 1.0)))
        self._reset_voice_timeout(guild_id)
        return True
    except Exception as e:
        # Acks are a nicety: never let a failure here disturb the turn.
        logger.debug("play_ack_in_voice failed: %s", e)
        return False
    finally:
        # Remove both the requested path and whatever the tool produced.
        for p in {audio_path, actual}:
            if p and os.path.isfile(p):
                try:
                    os.unlink(p)
                except OSError:
                    pass


def voice_mixer_active(self, guild_id: int) -> bool:
    """True when a continuous mixer is installed for this guild.

    Uses ``getattr`` so adapters built with ``object.__new__`` (test
    helpers that skip ``__init__``) report False instead of raising.
    """
    mixers = getattr(self, "_voice_mixers", None)
    return bool(mixers) and mixers.get(guild_id) is not None
Returns True on success.""" if not self._client or not DISCORD_AVAILABLE: @@ -1957,6 +2117,15 @@ class DiscordAdapter(BasePlatformAdapter): except Exception as e: logger.warning("Voice receiver failed to start: %s", e) + # Phase 3: install the continuous mixer (ambient bed + ducked + # speech). Best-effort — if it fails we fall back to the legacy + # one-shot FFmpegPCMAudio playback path in play_in_voice_channel. + if getattr(self, "_voice_fx_cfg", {}).get("enabled"): + try: + await self._install_voice_mixer(guild_id, vc) + except Exception as e: + logger.warning("Voice mixer failed to start: %s", e) + return True async def leave_voice_channel(self, guild_id: int) -> None: @@ -1970,8 +2139,17 @@ class DiscordAdapter(BasePlatformAdapter): if listen_task: listen_task.cancel() + # Tear down the mixer (stops the continuous outgoing stream). + if getattr(self, "_voice_mixers", None) is not None: + self._voice_mixers.pop(guild_id, None) + vc = self._voice_clients.pop(guild_id, None) if vc and vc.is_connected(): + try: + if vc.is_playing(): + vc.stop() + except Exception: + pass await vc.disconnect() task = self._voice_timeout_tasks.pop(guild_id, None) if task: @@ -1983,11 +2161,43 @@ class DiscordAdapter(BasePlatformAdapter): PLAYBACK_TIMEOUT = 120 async def play_in_voice_channel(self, guild_id: int, audio_path: str) -> bool: - """Play an audio file in the connected voice channel.""" + """Play an audio file in the connected voice channel. + + When the continuous mixer is installed for this guild, the clip is + decoded to PCM and layered over the ambient bed (ducking it) so the + reply can overlap the idle "thinking" loop seamlessly. Otherwise we + fall back to the legacy one-shot FFmpegPCMAudio path. 
+ """ vc = self._voice_clients.get(guild_id) if not vc or not vc.is_connected(): return False + # ── Mixer path (overlap + ducking) ────────────────────────────── + mixer = getattr(self, "_voice_mixers", {}).get(guild_id) if getattr(self, "_voice_mixers", None) else None + if mixer is not None: + try: + from voice_mixer import decode_to_pcm + except ImportError: + from .voice_mixer import decode_to_pcm + pcm = await asyncio.to_thread(decode_to_pcm, audio_path) + if pcm: + speech_gain = float(self._voice_fx_cfg.get("speech_gain", 1.0)) + mixer.play_speech(pcm, gain=speech_gain) + # Block until the speech child drains so callers serialise + # replies (mirrors legacy semantics) but the ambient keeps + # playing underneath the whole time. + wait_start = time.monotonic() + while mixer.speech_active: + if time.monotonic() - wait_start > self.PLAYBACK_TIMEOUT: + logger.warning("Mixer speech playback timed out after %ds", self.PLAYBACK_TIMEOUT) + mixer.stop_speech() + break + await asyncio.sleep(0.05) + self._reset_voice_timeout(guild_id) + return True + logger.warning("Mixer decode failed for %s; falling back to legacy playback", audio_path) + + # ── Legacy one-shot path (no mixer) ───────────────────────────── # Pause voice receiver while playing (echo prevention) receiver = self._voice_receivers.get(guild_id) if receiver: diff --git a/plugins/platforms/discord/voice_mixer.py b/plugins/platforms/discord/voice_mixer.py new file mode 100644 index 00000000000..c8f5b7eef3a --- /dev/null +++ b/plugins/platforms/discord/voice_mixer.py @@ -0,0 +1,378 @@ +from __future__ import annotations + +""" +Continuous PCM audio mixer for Discord voice channels. + +discord.py (Rapptz) ships no audio mixer: ``VoiceClient.play()`` accepts a +single :class:`discord.AudioSource` and raises ``ClientException`` if called +while already playing. One opus stream per connection, one source feeding it. + +This module adds software mixing *upstream* of that single stream. 
A +:class:`VoiceMixer` is itself a ``discord.AudioSource`` that discord.py polls +every 20 ms via :meth:`read`. Internally it sums the 20 ms PCM frames of any +number of child sources, clamps to int16, and returns one blended frame. +discord.py never knows several streams were combined underneath — it just +encodes and sends the single mixed frame. + +This gives us, for one voice connection at once: + + * an always-on low-volume **ambient/idle loop** (the "thinking" sound), + * a **speech** channel (TTS replies, verbal acknowledgements) that plays + *over* the ambient bed, automatically **ducking** the ambient gain down + while speech is active and restoring it when speech ends — the smooth + Grok-voice-mode feel, instead of stop-and-swap. + +Design notes +------------ +* The mixer is installed **once** per guild on join (``vc.play(mixer)``) and + runs continuously until the bot leaves. Children come and go; the mixer + itself never stops, so there is no ``is_playing()`` race between an + acknowledgement and the final reply. +* Frame format is Discord-native: 48 kHz, 2 channels, signed 16-bit LE, + 20 ms per frame == ``discord.opus.Encoder.FRAME_SIZE`` bytes + (3840 = 960 samples * 2 channels * 2 bytes). +* Mixing is a single vectorised float32 add + clip per 20 ms frame (numpy, + imported lazily from the optional ``voice`` extra). CPU cost is negligible. +* :meth:`read` is called from discord.py's audio sender **thread**, while + children are added/removed from the asyncio event loop thread, so all + shared state is guarded by a plain ``threading.Lock``. + +The mixer NEVER touches the inbound receive path: it only produces the bot's +*outgoing* stream. The :class:`VoiceReceiver` decodes incoming SSRCs only, so +the mixer's output cannot echo back into transcription. 
import logging
import math
import threading
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:  # numpy is an optional ("voice" extra) dep — never import at runtime top-level
    import numpy as np

try:
    # VoiceClient.play() only accepts discord.AudioSource instances, so the
    # mixer derives from it whenever discord.py is importable. The fallback
    # keeps this module (and its unit tests) usable without discord.py.
    from discord import AudioSource as _AudioSourceBase
except ImportError:  # pragma: no cover - depends on the environment
    _AudioSourceBase = object

logger = logging.getLogger(__name__)


def _require_numpy():
    """Import numpy lazily and return the module.

    numpy ships in the optional ``voice`` extra, not the base install, so this
    module must import cleanly without it. Callers that actually mix audio
    call this helper.

    Raises:
        ImportError: with an actionable message when numpy is not installed.
    """
    try:
        import numpy as np  # noqa: PLC0415 — intentional lazy import
    except ImportError as exc:
        raise ImportError(
            "numpy is required for Discord voice mixing; "
            "install the optional 'voice' extra"
        ) from exc
    return np


# Discord-native frame geometry (matches discord.opus.Encoder).
SAMPLE_RATE = 48000
CHANNELS = 2
SAMPLE_WIDTH = 2  # bytes per sample (s16)
FRAME_LENGTH_MS = 20
SAMPLES_PER_FRAME = SAMPLE_RATE * FRAME_LENGTH_MS // 1000   # 960
FRAME_SIZE = SAMPLES_PER_FRAME * CHANNELS * SAMPLE_WIDTH    # 3840 bytes
SILENCE_FRAME = b"\x00" * FRAME_SIZE


class MixerChild:
    """A single audio stream feeding into :class:`VoiceMixer`.

    Wraps raw 48 kHz / stereo / s16le PCM bytes. ``read_frame`` hands back one
    20 ms frame at a time, optionally looping, with a per-child gain (and an
    optional linear fade-in) applied.
    """

    __slots__ = (
        "name", "_pcm", "_pos", "loop", "gain",
        "is_speech", "fade_frames", "_fade_done", "_finished",
    )

    def __init__(
        self,
        name: str,
        pcm: bytes,
        *,
        loop: bool = False,
        gain: float = 1.0,
        is_speech: bool = False,
        fade_in_ms: int = 0,
    ):
        # Pad to a whole number of frames so looping is seamless and the final
        # partial frame doesn't click.
        remainder = len(pcm) % FRAME_SIZE
        if remainder:
            pcm = pcm + b"\x00" * (FRAME_SIZE - remainder)
        self.name = name
        self._pcm = pcm
        self._pos = 0
        self.loop = loop
        self.gain = float(gain)
        self.is_speech = is_speech
        # Linear fade-in over N frames avoids a click when a loud child starts.
        self.fade_frames = max(0, fade_in_ms // FRAME_LENGTH_MS)
        self._fade_done = 0
        self._finished = False

    @property
    def finished(self) -> bool:
        """True once a non-looping child has handed out its last frame."""
        return self._finished

    def read_frame(self) -> "Optional[np.ndarray]":
        """Return the next 20 ms frame, or None when the child is exhausted.

        The frame is a fresh **float32** ndarray of ``FRAME_SIZE // 2``
        samples with gain already applied; it is not clamped — the mixer
        clips the summed result and converts to int16.
        """
        if self._finished:
            return None
        if self._pos >= len(self._pcm):
            if self.loop and self._pcm:
                self._pos = 0
            else:
                self._finished = True
                return None

        np = _require_numpy()
        chunk = self._pcm[self._pos:self._pos + FRAME_SIZE]
        self._pos += FRAME_SIZE
        if len(chunk) < FRAME_SIZE:
            chunk = chunk + b"\x00" * (FRAME_SIZE - len(chunk))

        # astype() copies, so the result is writable (np.clip(out=...) later).
        samples = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)

        gain = self.gain
        if self.fade_frames and self._fade_done < self.fade_frames:
            self._fade_done += 1
            gain *= self._fade_done / self.fade_frames

        if gain != 1.0:
            samples = samples * gain
        return samples


class VoiceMixer(_AudioSourceBase):
    """A continuous ``discord.AudioSource`` that mixes N child streams.

    Use :meth:`set_ambient` to install/replace the looping idle bed and
    :meth:`play_speech` to layer a one-shot clip over it (ducking the ambient
    while it plays). Both are safe to call from the asyncio loop thread while
    discord.py drains :meth:`read` from its sender thread: all shared state is
    guarded by one ``threading.Lock``.
    """

    def __init__(
        self,
        *,
        ambient_gain: float = 0.18,
        duck_gain: float = 0.06,
        speech_gain: float = 1.0,
        duck_release_ms: int = 400,
    ):
        # Created first: cleanup() (which the AudioSource base may invoke on
        # garbage collection) needs the lock to exist.
        self._lock = threading.Lock()
        self._ambient: Optional[MixerChild] = None
        self._speech: List[MixerChild] = []
        self._ambient_gain = float(ambient_gain)
        self._duck_gain = float(duck_gain)
        self._speech_gain = float(speech_gain)
        # When speech ends, ramp the ambient back up over this many frames
        # instead of jumping, so the bed swells back smoothly.
        self._duck_release_frames = max(1, duck_release_ms // FRAME_LENGTH_MS)
        self._duck_release_left = 0
        self._closed = False
        # Tracks whether speech is currently active, for external callers that
        # want to avoid double-ducking or know when a reply is mid-flight.
        self._speech_active = False

    # discord.AudioSource subclasses set is_opus()==False to receive PCM.
    def is_opus(self) -> bool:  # pragma: no cover - trivial
        return False

    # ------------------------------------------------------------------
    # Ambient (idle / "thinking") bed
    # ------------------------------------------------------------------

    def set_ambient(self, pcm: Optional[bytes], *, gain: Optional[float] = None) -> None:
        """Install (or clear, with ``pcm=None``) the looping ambient bed.

        ``gain`` optionally replaces the un-ducked ambient gain. A new bed
        starts at the ducked gain when speech is currently playing.
        """
        with self._lock:
            if gain is not None:
                self._ambient_gain = float(gain)
            if not pcm:
                self._ambient = None
                return
            self._ambient = MixerChild(
                "ambient", pcm, loop=True,
                gain=self._effective_ambient_gain(), fade_in_ms=200,
            )

    def _effective_ambient_gain(self) -> float:
        return self._duck_gain if self._speech_active else self._ambient_gain

    # ------------------------------------------------------------------
    # Speech (TTS replies, verbal acks) layered over the ambient bed
    # ------------------------------------------------------------------

    def play_speech(self, pcm: bytes, *, gain: Optional[float] = None,
                    fade_in_ms: int = 40) -> None:
        """Layer a one-shot speech clip over the ambient bed (ducks ambient).

        A no-op for empty ``pcm`` and after :meth:`cleanup`: once the stream
        is closed nothing drains the clip, so queueing it would leave
        :attr:`speech_active` stuck at True for anyone polling it.
        """
        if not pcm:
            return
        with self._lock:
            if self._closed:
                return
            child = MixerChild(
                "speech", pcm, loop=False,
                gain=self._speech_gain if gain is None else float(gain),
                is_speech=True, fade_in_ms=fade_in_ms,
            )
            self._speech.append(child)
            self._speech_active = True
            self._duck_release_left = 0
            if self._ambient is not None:
                self._ambient.gain = self._duck_gain

    @property
    def speech_active(self) -> bool:
        """True while at least one speech clip is queued or playing."""
        with self._lock:
            return self._speech_active

    def stop_speech(self) -> None:
        """Drop any in-flight speech immediately and release the duck.

        Does nothing when no speech is active, so an idle call cannot make
        the ambient bed dip to the duck gain and ramp back up.
        """
        with self._lock:
            was_active = self._speech_active or bool(self._speech)
            self._speech.clear()
            if was_active:
                self._begin_duck_release_locked()

    def _begin_duck_release_locked(self) -> None:
        # Caller must hold self._lock.
        self._speech_active = False
        self._duck_release_left = self._duck_release_frames

    # ------------------------------------------------------------------
    # AudioSource interface — called from discord.py's sender thread
    # ------------------------------------------------------------------

    def read(self) -> bytes:
        """Return one 20 ms mixed PCM frame (always FRAME_SIZE bytes).

        Returning a non-empty frame keeps discord.py's player alive; we never
        return b"" because that would stop the single underlying stream and we
        want the mixer to run continuously for the lifetime of the connection.
        """
        with self._lock:
            if self._closed:
                return SILENCE_FRAME

            np = _require_numpy()
            acc: "Optional[np.ndarray]" = None

            # Speech children (drop exhausted ones; release duck when last ends)
            if self._speech:
                still_live: List[MixerChild] = []
                for child in self._speech:
                    frame = child.read_frame()
                    if frame is None:
                        continue
                    acc = frame if acc is None else acc + frame
                    still_live.append(child)
                self._speech = still_live
                if not self._speech and self._speech_active:
                    self._begin_duck_release_locked()

            # Ambient bed — ramp gain back up during duck-release.
            if self._ambient is not None:
                if self._duck_release_left > 0 and not self._speech_active:
                    self._duck_release_left -= 1
                    frac = 1.0 - (self._duck_release_left / self._duck_release_frames)
                    self._ambient.gain = (
                        self._duck_gain
                        + (self._ambient_gain - self._duck_gain) * frac
                    )
                elif not self._speech_active and self._duck_release_left == 0:
                    self._ambient.gain = self._ambient_gain
                amb = self._ambient.read_frame()
                if amb is not None:
                    acc = amb if acc is None else acc + amb

            if acc is None:
                return SILENCE_FRAME

            # Clamp before the int16 cast so loud sums saturate, not wrap.
            np.clip(acc, -32768, 32767, out=acc)
            return acc.astype(np.int16).tobytes()

    def cleanup(self) -> None:  # called by discord.py when playback stops
        """Mark the mixer closed and drop all children; read() then yields silence."""
        with self._lock:
            self._closed = True
            self._ambient = None
            self._speech.clear()
            self._speech_active = False
            self._duck_release_left = 0


# ----------------------------------------------------------------------
# PCM helpers
# ----------------------------------------------------------------------

def decode_to_pcm(path: str, *, timeout: float = 30.0) -> Optional[bytes]:
    """Decode any audio file to 48 kHz / stereo / s16le PCM via ffmpeg.

    Args:
        path: Audio file to decode.
        timeout: Seconds to wait for ffmpeg before giving up.

    Returns:
        The raw PCM bytes, or None on any failure (empty path, ffmpeg
        missing, timeout, non-zero exit status, empty output).
    """
    import subprocess

    if not path:
        return None
    try:
        proc = subprocess.run(
            [
                # -nostdin + stdin=DEVNULL: ffmpeg must not read (or wait on)
                # the gateway's own stdin / terminal.
                "ffmpeg", "-nostdin", "-y", "-loglevel", "error",
                "-i", path,
                "-f", "s16le",
                "-ar", str(SAMPLE_RATE),
                "-ac", str(CHANNELS),
                "pipe:1",
            ],
            stdin=subprocess.DEVNULL,
            capture_output=True,
            timeout=timeout,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        logger.warning("decode_to_pcm failed for %s: %s", path, e)
        return None
    if proc.returncode != 0:
        logger.warning(
            "ffmpeg decode failed for %s (rc=%d): %s",
            path, proc.returncode, (proc.stderr or b"").decode("utf-8", "replace")[:200],
        )
        return None
    return proc.stdout or None


def synth_ambient_pcm(seconds: float = 4.0) -> bytes:
    """Synthesise a subtle looping ambient bed (no asset file required).

    A soft, slowly-pulsing low pad: two detuned sine partials with a gentle
    tremolo, plus a touch of smoothed noise. Mono content duplicated to
    stereo. The loop is built to wrap without a click: its length is rounded
    to a whole number of 20 ms frames (so :class:`MixerChild` never pads it
    with silence), every partial completes whole cycles, and the noise is
    smoothed circularly.

    Args:
        seconds: Approximate loop length; rounded to the nearest whole frame
            (minimum one frame).

    Returns:
        48 kHz / stereo / s16le PCM bytes, a multiple of ``FRAME_SIZE`` long.

    Raises:
        ValueError: if ``seconds`` is not a positive finite number.
    """
    if not math.isfinite(seconds) or seconds <= 0:
        raise ValueError(f"seconds must be a positive finite number, got {seconds!r}")

    np = _require_numpy()
    frames = max(1, round(seconds * 1000 / FRAME_LENGTH_MS))
    n = frames * SAMPLES_PER_FRAME
    duration = n / SAMPLE_RATE  # actual loop length after frame rounding
    t = np.arange(n, dtype=np.float64) / SAMPLE_RATE

    # Choose base frequencies that complete whole cycles over the loop so the
    # wrap point is click-free.
    def _whole_cycle_freq(target: float) -> float:
        cycles = max(1, round(target * duration))
        return cycles / duration

    f1 = _whole_cycle_freq(110.0)
    f2 = _whole_cycle_freq(110.5)
    trem = _whole_cycle_freq(0.5)   # ~0.5 Hz tremolo

    pad = (
        0.55 * np.sin(2 * np.pi * f1 * t)
        + 0.45 * np.sin(2 * np.pi * f2 * t)
    )
    tremolo = 0.6 + 0.4 * (0.5 * (1 + np.sin(2 * np.pi * trem * t)))
    signal = pad * tremolo

    # Smooth filtered noise for air, kept very low. Prepending the tail makes
    # the moving average circular, so the noise is continuous across the wrap.
    rng = np.random.default_rng(7)
    noise = rng.standard_normal(n)
    width = 64
    kernel = np.ones(width) / width
    wrapped = np.concatenate([noise[-(width - 1):], noise])
    noise = np.convolve(wrapped, kernel, mode="valid")   # length n again
    signal = signal + 0.08 * noise

    # Normalise to a modest peak (mixer applies the real ambient gain on top).
    peak = float(np.max(np.abs(signal))) or 1.0
    signal = (signal / peak) * 0.5

    mono16 = (signal * 32767.0).astype(np.int16)
    stereo16 = np.repeat(mono16[:, None], CHANNELS, axis=1).reshape(-1)
    return stereo16.tobytes()
The adapter +integration (install on join, play routing, ack) is tested with the standard +``object.__new__(DiscordAdapter)`` helper used elsewhere in the voice suite. +""" + +import os +import sys +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +# numpy ships only in the optional "voice" extra (not [all,dev]); the mixer +# math needs it, so skip this whole module when it isn't installed. +np = pytest.importorskip("numpy") + +# voice_mixer lives inside the discord plugin package dir; import by path the +# same way the adapter does. +_DISCORD_DIR = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "plugins", "platforms", "discord", +) +if _DISCORD_DIR not in sys.path: + sys.path.insert(0, _DISCORD_DIR) + +import voice_mixer as vm # noqa: E402 + + +# ===================================================================== +# Pure mixer unit tests +# ===================================================================== + +class TestVoiceMixerCore: + def test_frame_geometry_matches_discord(self): + # 20ms @ 48kHz stereo s16 == 3840 bytes (discord.opus.Encoder.FRAME_SIZE) + assert vm.FRAME_SIZE == 3840 + assert vm.SAMPLES_PER_FRAME == 960 + assert len(vm.SILENCE_FRAME) == vm.FRAME_SIZE + + def test_empty_mixer_returns_silence_frames(self): + mx = vm.VoiceMixer() + for _ in range(5): + frame = mx.read() + assert len(frame) == vm.FRAME_SIZE + assert frame == vm.SILENCE_FRAME + + def test_is_opus_false(self): + # discord.py sends raw PCM when is_opus() is False. 
+ assert vm.VoiceMixer().is_opus() is False + + def test_ambient_loops_and_is_quiet(self): + mx = vm.VoiceMixer(ambient_gain=0.2) + amb = vm.synth_ambient_pcm(seconds=0.5) + assert len(amb) % vm.FRAME_SIZE == 0 # frame-aligned for seamless loop + mx.set_ambient(amb) + peaks = [int(np.max(np.abs(np.frombuffer(mx.read(), dtype=np.int16)))) + for _ in range(100)] # 2s >> 0.5s loop + # Produces audio after the fade-in and stays under the configured gain. + assert any(p > 0 for p in peaks[10:]) + assert max(peaks) < int(32767 * 0.5) + + def test_speech_audible_over_ambient_then_releases(self): + mx = vm.VoiceMixer(ambient_gain=0.2, duck_gain=0.05, duck_release_ms=200) + mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5)) + base = max(int(np.max(np.abs(np.frombuffer(mx.read(), dtype=np.int16)))) + for _ in range(10)) + tone = (np.sin(2 * np.pi * 440 * np.arange(int(48000 * 0.4)) / 48000) + * 20000).astype(np.int16) + stereo = np.repeat(tone[:, None], 2, axis=1).reshape(-1).tobytes() + mx.play_speech(stereo, fade_in_ms=0) + assert mx.speech_active + speech_peak = max(int(np.max(np.abs(np.frombuffer(mx.read(), dtype=np.int16)))) + for _ in range(15)) + assert speech_peak > base + # Drain past speech + release ramp; speech_active clears. 
+ for _ in range(40): + mx.read() + assert not mx.speech_active + + def test_clipping_prevents_int16_wraparound(self): + mx = vm.VoiceMixer() + loud = (np.ones(vm.SAMPLES_PER_FRAME * 2) * 30000).astype(np.int16).tobytes() + mx.play_speech(loud, fade_in_ms=0) + mx.play_speech(loud, fade_in_ms=0) + out = np.frombuffer(mx.read(), dtype=np.int16) + assert int(out.max()) == 32767 # clamped, not wrapped to negative + assert int(out.min()) >= -32768 + + def test_stop_speech_clears_in_flight(self): + mx = vm.VoiceMixer() + tone = (np.ones(48000) * 10000).astype(np.int16) + stereo = np.repeat(tone[:, None], 2, axis=1).reshape(-1).tobytes() + mx.play_speech(stereo) + assert mx.speech_active + mx.stop_speech() + mx.read() + assert not mx.speech_active + + def test_set_ambient_none_clears(self): + mx = vm.VoiceMixer() + mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5)) + mx.set_ambient(None) + # No ambient, no speech -> silence. + assert mx.read() == vm.SILENCE_FRAME + + def test_cleanup_silences(self): + mx = vm.VoiceMixer() + mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5)) + mx.cleanup() + assert mx.read() == vm.SILENCE_FRAME + + def test_pcm_not_frame_aligned_is_padded(self): + # Odd-length PCM must be padded to whole frames (no IndexError, no click). 
+ mx = vm.VoiceMixer() + mx.play_speech(b"\x01\x02\x03", fade_in_ms=0) # 3 bytes << one frame + out = mx.read() + assert len(out) == vm.FRAME_SIZE + + def test_synth_ambient_is_stereo_and_frame_aligned(self): + pcm = vm.synth_ambient_pcm(seconds=1.0) + assert len(pcm) % (vm.CHANNELS * vm.SAMPLE_WIDTH) == 0 + assert len(pcm) % vm.FRAME_SIZE == 0 + + +# ===================================================================== +# Adapter integration +# ===================================================================== + +def _make_adapter(fx_cfg=None): + from plugins.platforms.discord.adapter import DiscordAdapter + from gateway.config import Platform, PlatformConfig + config = PlatformConfig(enabled=True, extra={}) + config.token = "fake-token" + adapter = object.__new__(DiscordAdapter) + adapter.platform = Platform.DISCORD + adapter.config = config + adapter._client = MagicMock() + adapter._voice_clients = {} + adapter._voice_locks = {} + adapter._voice_text_channels = {} + adapter._voice_sources = {} + adapter._voice_timeout_tasks = {} + adapter._voice_receivers = {} + adapter._voice_listen_tasks = {} + adapter._voice_mixers = {} + adapter._ambient_pcm_cache = None + adapter._voice_fx_cfg = fx_cfg if fx_cfg is not None else { + "enabled": True, "ambient_enabled": True, "ambient_path": "", + "ambient_gain": 0.18, "duck_gain": 0.06, "speech_gain": 1.0, + "ack_enabled": True, "ack_phrases": ["One moment."], + } + return adapter + + +class TestVoiceMixerActive: + def test_false_when_no_mixer(self): + adapter = _make_adapter() + assert adapter.voice_mixer_active(111) is False + + def test_true_when_mixer_present(self): + adapter = _make_adapter() + adapter._voice_mixers[111] = object() + assert adapter.voice_mixer_active(111) is True + + def test_false_when_attr_missing(self): + # Defensive getattr path (object.__new__ helper that forgot the attr). 
+ from plugins.platforms.discord.adapter import DiscordAdapter + from gateway.config import Platform + bare = object.__new__(DiscordAdapter) + bare.platform = Platform.DISCORD + assert bare.voice_mixer_active(111) is False + + +class TestPlayInVoiceChannelMixerPath: + @pytest.mark.asyncio + async def test_routes_through_mixer_when_present(self): + adapter = _make_adapter() + vc = MagicMock() + vc.is_connected.return_value = True + adapter._voice_clients[111] = vc + + # speech_active returns True once (so play_speech is observed) then + # False so the wait loop exits promptly. + class _Mixer: + def __init__(self): + self._polls = 0 + self.play_speech = MagicMock() + + @property + def speech_active(self): + self._polls += 1 + return self._polls <= 1 + + mixer = _Mixer() + adapter._voice_mixers[111] = mixer + adapter._reset_voice_timeout = MagicMock() + + fake_pcm = b"\x00" * vm.FRAME_SIZE + with patch.object(vm, "decode_to_pcm", return_value=fake_pcm): + ok = await adapter.play_in_voice_channel(111, "/tmp/x.mp3") + assert ok is True + mixer.play_speech.assert_called_once() + # Legacy path must NOT have been used. + vc.play.assert_not_called() + + @pytest.mark.asyncio + async def test_falls_back_when_decode_fails(self): + adapter = _make_adapter() + vc = MagicMock() + vc.is_connected.return_value = True + vc.is_playing.return_value = False + adapter._voice_clients[111] = vc + adapter._voice_mixers[111] = MagicMock() + adapter._reset_voice_timeout = MagicMock() + adapter._voice_receivers[111] = MagicMock() + + with patch.object(vm, "decode_to_pcm", return_value=None), \ + patch("plugins.platforms.discord.adapter.discord") as mock_discord: + mock_discord.FFmpegPCMAudio.return_value = MagicMock() + mock_discord.PCMVolumeTransformer.return_value = MagicMock() + + # Make the legacy wait loop resolve immediately without leaving the + # real Event.wait() coroutine unawaited. 
+ async def _fast(coro, *a, **k): + if hasattr(coro, "close"): + coro.close() + return None + with patch("asyncio.wait_for", _fast): + ok = await adapter.play_in_voice_channel(111, "/tmp/x.mp3") + # Fell through to legacy path -> vc.play called. + assert vc.play.called + + +class TestPlayAckInVoice: + @pytest.mark.asyncio + async def test_noop_when_ack_disabled(self): + adapter = _make_adapter({"ack_enabled": False}) + adapter._voice_mixers[111] = MagicMock() + assert await adapter.play_ack_in_voice(111) is False + + @pytest.mark.asyncio + async def test_noop_when_no_mixer(self): + adapter = _make_adapter() + assert await adapter.play_ack_in_voice(111) is False + + @pytest.mark.asyncio + async def test_plays_speech_when_armed(self, tmp_path): + adapter = _make_adapter() + mixer = MagicMock() + adapter._voice_mixers[111] = mixer + adapter._reset_voice_timeout = MagicMock() + + ack_file = tmp_path / "ack.mp3" + ack_file.write_bytes(b"id3") + import json as _json + with patch("tools.tts_tool.text_to_speech_tool", + return_value=_json.dumps({"success": True, "file_path": str(ack_file)})), \ + patch.object(vm, "decode_to_pcm", return_value=b"\x00" * vm.FRAME_SIZE): + ok = await adapter.play_ack_in_voice(111, phrase="Testing one two.") + assert ok is True + mixer.play_speech.assert_called_once() diff --git a/website/docs/user-guide/messaging/discord.md b/website/docs/user-guide/messaging/discord.md index 60b3cacd61c..6ffa44db6c5 100644 --- a/website/docs/user-guide/messaging/discord.md +++ b/website/docs/user-guide/messaging/discord.md @@ -683,6 +683,37 @@ For the full setup and operational guide, see: - [Voice Mode](/user-guide/features/voice-mode) - [Use Voice Mode with Hermes](/guides/use-voice-mode-with-hermes) +### Voice Channel Audio Effects (ambient + verbal acks) + +When the bot is in a voice channel, you can give it a more conversational feel: a short verbal acknowledgement ("let me look into that") before it starts working, and a subtle ambient "thinking" bed 
that plays underneath while tools run — speech ducks the ambient down, and the bed swells back once the speech finishes, similar to Grok voice mode. + +discord.py plays only one audio stream per connection, so Hermes installs a software mixer on the outgoing stream that sums an ambient loop, acknowledgements, and TTS replies into that single stream — they overlap instead of cutting each other off. + +This is **off by default**. Enable it in `config.yaml`: + +```yaml +discord: + voice_fx: + enabled: true # master switch + ambient_enabled: true # idle "thinking" bed while tools run + ambient_path: "" # custom loop file (any audio format); "" = built-in synthesised pad + ambient_gain: 0.18 # idle bed loudness (0.0–1.0) + duck_gain: 0.06 # ambient loudness while the bot is speaking + speech_gain: 1.0 # TTS / acknowledgement loudness + ack_enabled: true # speak a short phrase before the first tool call of a turn + ack_phrases: # picked at random; set to [] to disable the spoken ack + - "Let me look into that." + - "One moment." + - "Checking on that now." +``` + +Notes: +- The acknowledgement fires at most once per turn, only when the bot is in a voice channel and the mixer is active. It uses your configured TTS provider. +- `ambient_path` accepts any file `ffmpeg` can decode; it's looped seamlessly. Leave it empty to use the built-in synthesised pad (no asset needed). +- All settings live in `config.yaml` (not `.env`) — they're behavioral, not secrets. +- When `voice_fx.enabled` is `false`, voice playback uses the original one-shot path and nothing changes. + + ## Forum Channels Discord forum channels (type 15) don't accept direct messages — every post in a forum must be a thread. Hermes auto-detects forum channels and creates a new thread post whenever it needs to send there, so `send_message`, TTS, images, voice messages, and file attachments all work without special handling from the agent.