mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
feat(discord): voice-channel mixer — ambient idle bed + verbal acks that overlap TTS (#39659)
* feat(discord): voice-channel mixer — ambient idle bed + verbal acks that overlap TTS
Discord voice mode can now feel conversational: the bot speaks a short
acknowledgement before it starts working, and a subtle ambient 'thinking' bed
plays underneath while tools run, ducking under speech and swelling back — the
Grok-voice-mode feel.
discord.py plays only one audio stream per voice connection, so this adds a
software mixer (VoiceMixer, a discord.AudioSource) installed once per guild on
join. It sums an ambient loop, verbal acks, and TTS replies into that single
20ms/48kHz/stereo stream (numpy int16 add + clip), so they overlap instead of
stop-and-swap. Speech ducks the ambient gain down and releases it smoothly.
- plugins/platforms/discord/voice_mixer.py: VoiceMixer + MixerChild (gain,
loop, fade, duck/release), decode_to_pcm (ffmpeg), synth_ambient_pcm (no
asset needed — synthesised pad).
- adapter: install mixer on join, tear down on leave, route
play_in_voice_channel through the mixer (legacy one-shot path kept as
fallback), play_ack_in_voice, voice_mixer_active. Defensive getattr for the
object.__new__ test helpers.
- gateway/run.py: tool_start_callback fires a one-time verbal ack on the first
tool call of a turn when in a voice channel (independent of the text
tool-progress gate). No system-prompt or message-flow changes.
- config: discord.voice_fx.* (OFF by default; ambient/duck/speech gains, ack
phrases). All in config.yaml, not .env.
- docs + tests (mixer unit + adapter integration).
Verified: 19 new tests pass, existing voice suite green (2 pre-existing
davey-module env failures unchanged), and a real-mixer E2E confirms ambient
streams, TTS overlaps it, acks layer in, and teardown is clean.
* fix(discord): make voice mixer numpy import lazy (numpy is voice-extra-only)
numpy ships in the optional 'voice' extra, not [all,dev], so a module-level
'import numpy' broke CI test collection (and would break the always-imported
Discord adapter on any install without the voice extra). Defer numpy to the
functions that actually mix audio via _require_numpy(); guard the test module
with pytest.importorskip('numpy').
This commit is contained in:
parent
3da44dbda7
commit
8a9ded5b21
6 changed files with 952 additions and 1 deletions
|
|
@ -17024,6 +17024,47 @@ class GatewayRunner:
|
|||
last_progress_msg = [None] # Track last message for dedup
|
||||
repeat_count = [0] # How many times the same message repeated
|
||||
|
||||
# ── Discord voice "verbal ack before tool calls" ────────────────
|
||||
# When the bot is in a voice channel with the continuous mixer
|
||||
# installed (discord.voice_fx.enabled), speak a short phrase ("let me
|
||||
# look into that") over the ambient idle bed on the FIRST tool call of
|
||||
# the turn. Fires from tool_start_callback (independent of the
|
||||
# tool-progress text gate), at most once per turn. No-op on every
|
||||
# other platform / when not in a voice channel.
|
||||
_voice_ack_fired = [False]
|
||||
_voice_ack_guild: List[Optional[int]] = [None]
|
||||
if source.platform == Platform.DISCORD:
|
||||
_va = self.adapters.get(Platform.DISCORD)
|
||||
# source.chat_id is the linked text channel; resolve the guild whose
|
||||
# voice connection is bound to it (mirrors DiscordAdapter.play_tts).
|
||||
_vtc = getattr(_va, "_voice_text_channels", None)
|
||||
if isinstance(_vtc, dict) and hasattr(_va, "voice_mixer_active"):
|
||||
for _gid, _tc in _vtc.items():
|
||||
if str(_tc) == str(source.chat_id) and _va.voice_mixer_active(_gid):
|
||||
_voice_ack_guild[0] = _gid
|
||||
break
|
||||
_voice_ack_loop = asyncio.get_running_loop()
|
||||
|
||||
def voice_ack_callback(call_id, tool_name, args):
    """Tool-start hook: schedule one spoken acknowledgement per turn.

    The arguments are accepted to satisfy the callback signature and are
    not used. The ack is scheduled onto the captured event loop rather than
    awaited, because this callback is not a coroutine.
    """
    target_guild = _voice_ack_guild[0]
    if _voice_ack_fired[0] or target_guild is None:
        return
    if not _run_still_current():
        return

    # Latch before doing any work so later tool calls in this turn are no-ops
    # even when the adapter lookup below bails out.
    _voice_ack_fired[0] = True

    discord_adapter = self.adapters.get(Platform.DISCORD)
    if discord_adapter is None:
        return
    if not hasattr(discord_adapter, "play_ack_in_voice"):
        return

    try:
        ack_coro = discord_adapter.play_ack_in_voice(target_guild)
        safe_schedule_threadsafe(
            ack_coro,
            _voice_ack_loop,
            logger=logger,
            log_message="voice ack scheduling error",
        )
    except Exception as _ack_err:
        # Best-effort: a failed ack must never break the tool call itself.
        logger.debug("voice ack schedule failed: %s", _ack_err)
|
||||
|
||||
# Auto-cleanup of temporary progress bubbles (Telegram + any adapter
|
||||
# that implements ``delete_message``). When enabled via
|
||||
# ``display.platforms.<platform>.cleanup_progress: true``, message IDs
|
||||
|
|
@ -17834,6 +17875,11 @@ class GatewayRunner:
|
|||
# Per-message state — callbacks and reasoning config change every
|
||||
# turn and must not be baked into the cached agent constructor.
|
||||
agent.tool_progress_callback = progress_callback if tool_progress_enabled else None
|
||||
# Discord voice verbal-ack hook (fires once per turn on first tool
|
||||
# call; armed only when in a voice channel with the mixer running).
|
||||
agent.tool_start_callback = (
|
||||
voice_ack_callback if _voice_ack_guild[0] is not None else None
|
||||
)
|
||||
agent.step_callback = _step_callback_sync if _hooks_ref.loaded_hooks else None
|
||||
agent.stream_delta_callback = _stream_delta_cb
|
||||
agent.interim_assistant_callback = _interim_assistant_cb if _want_interim_messages else None
|
||||
|
|
|
|||
|
|
@ -1841,6 +1841,28 @@ DEFAULT_CONFIG = {
|
|||
# real memory cost. Default 32 MiB matches the historical hardcoded
|
||||
# cap. Set to 0 for no cap. Env override: DISCORD_MAX_ATTACHMENT_BYTES.
|
||||
"max_attachment_bytes": 33554432,
|
||||
# Voice-channel audio effects (the continuous mixer). OFF by default.
|
||||
# When enabled, the bot installs a software mixer on the outgoing voice
|
||||
# stream so a low ambient "thinking" bed, verbal acknowledgements, and
|
||||
# TTS replies can OVERLAP (ducking the ambient under speech) instead of
|
||||
# stop-and-swap — the Grok-voice-mode feel. discord.py ships no mixer;
|
||||
# this is implemented in plugins/platforms/discord/voice_mixer.py.
|
||||
"voice_fx": {
|
||||
"enabled": False, # master switch for the mixer subsystem
|
||||
"ambient_enabled": True, # play the idle "thinking" bed while tools run
|
||||
"ambient_path": "", # custom loop audio file; "" = synthesised pad
|
||||
"ambient_gain": 0.18, # idle bed loudness, 0.0–1.0
|
||||
"duck_gain": 0.06, # ambient loudness while speech plays
|
||||
"speech_gain": 1.0, # TTS / ack loudness, 0.0–1.0
|
||||
"ack_enabled": True, # speak a short phrase before the first tool call
|
||||
"ack_phrases": [ # picked at random; set [] to disable phrases
|
||||
"Let me look into that.",
|
||||
"One moment.",
|
||||
"Checking on that now.",
|
||||
"Give me a sec.",
|
||||
"On it.",
|
||||
],
|
||||
},
|
||||
},
|
||||
|
||||
# WhatsApp platform settings (gateway mode)
|
||||
|
|
|
|||
|
|
@ -600,6 +600,12 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
self._voice_listen_tasks: Dict[int, asyncio.Task] = {} # guild_id -> listen loop
|
||||
self._voice_input_callback: Optional[Callable] = None # set by run.py
|
||||
self._on_voice_disconnect: Optional[Callable] = None # set by run.py
|
||||
# Phase 3: continuous voice mixer (ambient idle bed + ducked speech).
|
||||
# Installed once per guild on join; lets acks / TTS / the "thinking"
|
||||
# loop overlap in one outgoing stream instead of stop-and-swap.
|
||||
self._voice_mixers: Dict[int, Any] = {} # guild_id -> VoiceMixer
|
||||
self._ambient_pcm_cache: Optional[bytes] = None # decoded ambient bed
|
||||
self._voice_fx_cfg: Dict[str, Any] = self._load_voice_fx_config()
|
||||
# Track threads where the bot has participated so follow-up messages
|
||||
# in those threads don't require @mention. Persisted to disk so the
|
||||
# set survives gateway restarts.
|
||||
|
|
@ -1925,6 +1931,160 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
# Voice channel methods (join / leave / play)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_voice_fx_config(self) -> Dict[str, Any]:
    """Build the effective ``discord.voice_fx`` settings.

    Starts from built-in defaults (feature disabled) and overlays any
    recognised, non-``None`` keys found under ``discord.voice_fx`` in the raw
    config. Unknown keys are ignored. Any failure while reading the config is
    logged at debug level and the defaults (plus whatever was already
    overlaid) are returned, so callers always get every key.
    """
    settings: Dict[str, Any] = {
        "enabled": False,           # master switch for the mixer subsystem
        "ambient_enabled": True,    # idle "thinking" bed while tools run
        "ambient_path": "",         # optional custom loop file; "" = synthesised
        "ambient_gain": 0.18,       # idle bed loudness (0..1)
        "duck_gain": 0.06,          # ambient loudness while speech plays
        "speech_gain": 1.0,         # TTS / ack loudness
        "ack_enabled": True,        # speak a short phrase before tool calls
        "ack_phrases": [
            "Let me look into that.",
            "One moment.",
            "Checking on that now.",
            "Give me a sec.",
            "On it.",
        ],
    }
    try:
        from hermes_cli.config import read_raw_config

        raw_cfg = read_raw_config() or {}
        discord_section = raw_cfg.get("discord") or {}
        overrides = discord_section.get("voice_fx") or {}
        if isinstance(overrides, dict):
            accepted = {
                key: value
                for key, value in overrides.items()
                if key in settings and value is not None
            }
            settings.update(accepted)
    except Exception as e:
        logger.debug("Could not load discord.voice_fx config: %s", e)
    return settings
|
||||
|
||||
def _get_ambient_pcm(self) -> Optional[bytes]:
    """Return the PCM bytes for the ambient idle bed, building them once.

    A previously built bed is returned straight from the cache. Otherwise,
    when ``ambient_enabled`` is falsy, ``None`` is returned. When
    ``ambient_path`` names an existing file it is decoded with
    ``decode_to_pcm``; if there is no such file or decoding yields nothing,
    ``synth_ambient_pcm()`` supplies the bed. The result is cached.
    """
    cached = self._ambient_pcm_cache
    if cached is not None:
        return cached
    if not self._voice_fx_cfg.get("ambient_enabled"):
        return None

    # Support both flat and package-relative import of the mixer module.
    try:
        from voice_mixer import decode_to_pcm, synth_ambient_pcm
    except ImportError:
        from .voice_mixer import decode_to_pcm, synth_ambient_pcm

    custom_path = (self._voice_fx_cfg.get("ambient_path") or "").strip()
    bed: Optional[bytes] = None
    if custom_path and os.path.isfile(custom_path):
        bed = decode_to_pcm(custom_path)
        if not bed:
            logger.warning("Ambient file %s failed to decode; using synth bed", custom_path)

    bed = bed or synth_ambient_pcm()
    self._ambient_pcm_cache = bed
    return bed
|
||||
|
||||
async def _install_voice_mixer(self, guild_id: int, vc) -> None:
    """Create a VoiceMixer, start the ambient bed, and play it on the VC.

    The mixer is meant to run for the life of the connection: one
    ``vc.play(mixer)`` call, not stopped until leave. If the stream ends
    anyway (for example on a sender error), the mixer is unregistered so
    that later playback does not queue speech into a source nobody is
    reading.

    Args:
        guild_id: Guild whose voice connection receives the mixer.
        vc: Connected voice client; must offer ``is_playing``, ``stop`` and
            ``play(source, after=...)``.

    Raises:
        Exception: whatever mixer construction or ``vc.play`` raises; the
            registration is rolled back first.
    """
    try:
        from voice_mixer import VoiceMixer
    except ImportError:
        from .voice_mixer import VoiceMixer

    mixer = VoiceMixer(
        ambient_gain=float(self._voice_fx_cfg.get("ambient_gain", 0.18)),
        duck_gain=float(self._voice_fx_cfg.get("duck_gain", 0.06)),
        speech_gain=float(self._voice_fx_cfg.get("speech_gain", 1.0)),
    )
    # Building the bed may decode a file or synthesise audio; keep it off the loop.
    ambient = await asyncio.to_thread(self._get_ambient_pcm)
    if ambient:
        mixer.set_ambient(ambient)

    def _unregister() -> None:
        # Identity check: never evict a newer mixer installed for this guild.
        registry = getattr(self, "_voice_mixers", None)
        if registry is not None and registry.get(guild_id) is mixer:
            registry.pop(guild_id, None)

    def _after(error):
        # Runs once the player is done with this source (finished or errored).
        if error:
            logger.error("Voice mixer stream error (guild=%d): %s", guild_id, error)
        _unregister()

    if vc.is_playing():
        vc.stop()
    # Register before play() so an immediately-ending stream is still cleaned
    # up by _after; roll back if play() itself refuses the source.
    # NOTE(review): discord.py's VoiceClient.play() type-checks its source
    # against discord.AudioSource — confirm VoiceMixer satisfies that check.
    self._voice_mixers[guild_id] = mixer
    try:
        vc.play(mixer, after=_after)
    except Exception:
        _unregister()
        raise
    logger.info("Voice mixer installed (guild=%d, ambient=%s)", guild_id, bool(ambient))
|
||||
|
||||
async def play_ack_in_voice(self, guild_id: int, phrase: Optional[str] = None) -> bool:
    """Speak a short acknowledgement over the ambient bed.

    Scheduled by the gateway's ``tool_start_callback`` on the first tool call
    of a turn, so the user hears e.g. "let me look into that" before the bot
    goes quiet to work.

    Args:
        guild_id: Guild whose installed mixer should receive the clip.
        phrase: Text to speak; when ``None`` one of the configured
            ``ack_phrases`` is picked at random.

    Returns:
        ``True`` when the clip was handed to the mixer; ``False`` when acks
        are disabled, no mixer is installed, or synthesis/decoding failed.
        Never raises for synthesis/playback problems (logged at debug level).
    """
    if not self._voice_fx_cfg.get("ack_enabled"):
        return False
    mixer = self._voice_mixers.get(guild_id)
    if mixer is None:
        return False
    if phrase is None:
        import random

        phrases = self._voice_fx_cfg.get("ack_phrases") or ["One moment."]
        if isinstance(phrases, str):
            # A bare string in config would otherwise yield a single character.
            phrases = [phrases]
        phrase = random.choice(phrases)

    # Synthesise the ack via the configured TTS provider, then layer it.
    import uuid as _uuid

    audio_path = os.path.join(
        tempfile.gettempdir(), "hermes_voice",
        f"ack_{_uuid.uuid4().hex[:12]}.mp3",
    )
    os.makedirs(os.path.dirname(audio_path), exist_ok=True)
    # The TTS tool may write somewhere other than the requested path; track
    # that explicitly so cleanup does not depend on introspecting locals().
    actual: Optional[str] = None
    try:
        from tools.tts_tool import text_to_speech_tool

        result_json = await asyncio.to_thread(
            text_to_speech_tool, text=phrase, output_path=audio_path
        )
        result = json.loads(result_json)
        actual = result.get("file_path", audio_path)
        if not result.get("success") or not os.path.isfile(actual):
            return False
        try:
            from voice_mixer import decode_to_pcm
        except ImportError:
            from .voice_mixer import decode_to_pcm
        pcm = await asyncio.to_thread(decode_to_pcm, actual)
        if not pcm:
            return False
        # TTS can take a while; skip if the mixer was torn down or replaced
        # in the meantime.
        if self._voice_mixers.get(guild_id) is not mixer:
            return False
        mixer.play_speech(
            pcm, gain=float(self._voice_fx_cfg.get("speech_gain", 1.0))
        )
        self._reset_voice_timeout(guild_id)
        return True
    except Exception as e:
        logger.debug("play_ack_in_voice failed: %s", e)
        return False
    finally:
        # Remove the temp clip(s) whether or not playback was queued; the PCM
        # is already in memory by the time we get here.
        for leftover in {audio_path, actual}:
            if leftover and os.path.isfile(leftover):
                try:
                    os.unlink(leftover)
                except OSError:
                    pass
|
||||
|
||||
def voice_mixer_active(self, guild_id: int) -> bool:
    """Report whether a mixer is registered for ``guild_id``.

    Tolerates instances on which ``_voice_mixers`` was never set (it is read
    with ``getattr``), returning ``False`` in that case.
    """
    registry = getattr(self, "_voice_mixers", None)
    if not registry:
        return False
    return registry.get(guild_id) is not None
|
||||
|
||||
async def join_voice_channel(self, channel) -> bool:
|
||||
"""Join a Discord voice channel. Returns True on success."""
|
||||
if not self._client or not DISCORD_AVAILABLE:
|
||||
|
|
@ -1957,6 +2117,15 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
except Exception as e:
|
||||
logger.warning("Voice receiver failed to start: %s", e)
|
||||
|
||||
# Phase 3: install the continuous mixer (ambient bed + ducked
|
||||
# speech). Best-effort — if it fails we fall back to the legacy
|
||||
# one-shot FFmpegPCMAudio playback path in play_in_voice_channel.
|
||||
if getattr(self, "_voice_fx_cfg", {}).get("enabled"):
|
||||
try:
|
||||
await self._install_voice_mixer(guild_id, vc)
|
||||
except Exception as e:
|
||||
logger.warning("Voice mixer failed to start: %s", e)
|
||||
|
||||
return True
|
||||
|
||||
async def leave_voice_channel(self, guild_id: int) -> None:
|
||||
|
|
@ -1970,8 +2139,17 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
if listen_task:
|
||||
listen_task.cancel()
|
||||
|
||||
# Tear down the mixer (stops the continuous outgoing stream).
|
||||
if getattr(self, "_voice_mixers", None) is not None:
|
||||
self._voice_mixers.pop(guild_id, None)
|
||||
|
||||
vc = self._voice_clients.pop(guild_id, None)
|
||||
if vc and vc.is_connected():
|
||||
try:
|
||||
if vc.is_playing():
|
||||
vc.stop()
|
||||
except Exception:
|
||||
pass
|
||||
await vc.disconnect()
|
||||
task = self._voice_timeout_tasks.pop(guild_id, None)
|
||||
if task:
|
||||
|
|
@ -1983,11 +2161,43 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
PLAYBACK_TIMEOUT = 120
|
||||
|
||||
async def play_in_voice_channel(self, guild_id: int, audio_path: str) -> bool:
|
||||
"""Play an audio file in the connected voice channel."""
|
||||
"""Play an audio file in the connected voice channel.
|
||||
|
||||
When the continuous mixer is installed for this guild, the clip is
|
||||
decoded to PCM and layered over the ambient bed (ducking it) so the
|
||||
reply can overlap the idle "thinking" loop seamlessly. Otherwise we
|
||||
fall back to the legacy one-shot FFmpegPCMAudio path.
|
||||
"""
|
||||
vc = self._voice_clients.get(guild_id)
|
||||
if not vc or not vc.is_connected():
|
||||
return False
|
||||
|
||||
# ── Mixer path (overlap + ducking) ──────────────────────────────
|
||||
mixer = getattr(self, "_voice_mixers", {}).get(guild_id) if getattr(self, "_voice_mixers", None) else None
|
||||
if mixer is not None:
|
||||
try:
|
||||
from voice_mixer import decode_to_pcm
|
||||
except ImportError:
|
||||
from .voice_mixer import decode_to_pcm
|
||||
pcm = await asyncio.to_thread(decode_to_pcm, audio_path)
|
||||
if pcm:
|
||||
speech_gain = float(self._voice_fx_cfg.get("speech_gain", 1.0))
|
||||
mixer.play_speech(pcm, gain=speech_gain)
|
||||
# Block until the speech child drains so callers serialise
|
||||
# replies (mirrors legacy semantics) but the ambient keeps
|
||||
# playing underneath the whole time.
|
||||
wait_start = time.monotonic()
|
||||
while mixer.speech_active:
|
||||
if time.monotonic() - wait_start > self.PLAYBACK_TIMEOUT:
|
||||
logger.warning("Mixer speech playback timed out after %ds", self.PLAYBACK_TIMEOUT)
|
||||
mixer.stop_speech()
|
||||
break
|
||||
await asyncio.sleep(0.05)
|
||||
self._reset_voice_timeout(guild_id)
|
||||
return True
|
||||
logger.warning("Mixer decode failed for %s; falling back to legacy playback", audio_path)
|
||||
|
||||
# ── Legacy one-shot path (no mixer) ─────────────────────────────
|
||||
# Pause voice receiver while playing (echo prevention)
|
||||
receiver = self._voice_receivers.get(guild_id)
|
||||
if receiver:
|
||||
|
|
|
|||
378
plugins/platforms/discord/voice_mixer.py
Normal file
378
plugins/platforms/discord/voice_mixer.py
Normal file
|
|
@ -0,0 +1,378 @@
|
|||
from __future__ import annotations
|
||||
|
||||
"""
|
||||
Continuous PCM audio mixer for Discord voice channels.
|
||||
|
||||
discord.py (Rapptz) ships no audio mixer: ``VoiceClient.play()`` accepts a
|
||||
single :class:`discord.AudioSource` and raises ``ClientException`` if called
|
||||
while already playing. One opus stream per connection, one source feeding it.
|
||||
|
||||
This module adds software mixing *upstream* of that single stream. A
|
||||
:class:`VoiceMixer` is itself a ``discord.AudioSource`` that discord.py polls
|
||||
every 20 ms via :meth:`read`. Internally it sums the 20 ms PCM frames of any
|
||||
number of child sources, clamps to int16, and returns one blended frame.
|
||||
discord.py never knows several streams were combined underneath — it just
|
||||
encodes and sends the single mixed frame.
|
||||
|
||||
This gives us, for one voice connection at once:
|
||||
|
||||
* an always-on low-volume **ambient/idle loop** (the "thinking" sound),
|
||||
* a **speech** channel (TTS replies, verbal acknowledgements) that plays
|
||||
*over* the ambient bed, automatically **ducking** the ambient gain down
|
||||
while speech is active and restoring it when speech ends — the smooth
|
||||
Grok-voice-mode feel, instead of stop-and-swap.
|
||||
|
||||
Design notes
|
||||
------------
|
||||
* The mixer is installed **once** per guild on join (``vc.play(mixer)``) and
|
||||
runs continuously until the bot leaves. Children come and go; the mixer
|
||||
itself never stops, so there is no ``is_playing()`` race between an
|
||||
acknowledgement and the final reply.
|
||||
* Frame format is Discord-native: 48 kHz, 2 channels, signed 16-bit LE,
|
||||
20 ms per frame == ``discord.opus.Encoder.FRAME_SIZE`` bytes
|
||||
(3840 = 960 samples * 2 channels * 2 bytes).
|
||||
* Mixing is a single vectorised float32 add + clip per 20 ms frame (numpy,
|
||||
an optional dependency from the ``voice`` extra, imported lazily). CPU cost is negligible.
|
||||
* :meth:`read` is called from discord.py's audio sender **thread**, while
|
||||
children are added/removed from the asyncio event loop thread, so all
|
||||
shared state is guarded by a plain ``threading.Lock``.
|
||||
|
||||
The mixer NEVER touches the inbound receive path: it only produces the bot's
|
||||
*outgoing* stream. The :class:`VoiceReceiver` decodes incoming SSRCs only, so
|
||||
the mixer's output cannot echo back into transcription.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
if TYPE_CHECKING: # numpy is an optional ("voice" extra) dep — never import at runtime top-level
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _require_numpy():
    """Import numpy lazily and return the module.

    numpy ships in the optional ``voice`` extra, not the base install, so this
    module must import cleanly without it. Callers that actually mix audio
    call this at use time.

    Returns:
        The imported ``numpy`` module.

    Raises:
        ImportError: numpy is not installed. The message names the missing
            extra, and the original import failure is chained as the cause.
    """
    try:
        import numpy as np  # noqa: PLC0415 — intentional lazy import
    except ImportError as exc:
        raise ImportError(
            "The Discord voice mixer requires numpy, which ships in the "
            "optional 'voice' extra and is not installed."
        ) from exc
    return np
|
||||
|
||||
# Discord-native frame geometry (matches discord.opus.Encoder).
|
||||
SAMPLE_RATE = 48000  # samples per second, per channel
CHANNELS = 2  # interleaved stereo
SAMPLE_WIDTH = 2  # bytes per sample (s16)
FRAME_LENGTH_MS = 20  # duration of one mixed frame
SAMPLES_PER_FRAME = SAMPLE_RATE * FRAME_LENGTH_MS // 1000  # 960
FRAME_SIZE = SAMPLES_PER_FRAME * CHANNELS * SAMPLE_WIDTH  # 3840 bytes
SILENCE_FRAME = bytes(FRAME_SIZE)  # one frame of digital silence
|
||||
|
||||
|
||||
class MixerChild:
    """One PCM stream that :class:`VoiceMixer` sums into its output.

    Holds raw PCM bytes (interpreted as int16 samples) and serves them one
    ``FRAME_SIZE``-byte frame at a time via :meth:`read_frame`, applying a
    per-child gain, an optional linear fade-in, and optional looping.
    """

    __slots__ = (
        "name", "_pcm", "_pos", "loop", "gain",
        "is_speech", "fade_frames", "_fade_done", "_finished",
    )

    def __init__(
        self,
        name: str,
        pcm: bytes,
        *,
        loop: bool = False,
        gain: float = 1.0,
        is_speech: bool = False,
        fade_in_ms: int = 0,
    ):
        # Zero-pad up to a frame boundary so every read yields a full frame.
        tail = len(pcm) % FRAME_SIZE
        if tail:
            pcm = pcm + bytes(FRAME_SIZE - tail)

        self.name = name
        self.loop = loop
        self.is_speech = is_speech
        self.gain = float(gain)

        self._pcm = pcm
        self._pos = 0
        self._finished = False

        # Fade-in length expressed in whole frames; 0 disables the fade.
        self.fade_frames = max(0, fade_in_ms // FRAME_LENGTH_MS)
        self._fade_done = 0

    @property
    def finished(self) -> bool:
        """True once a non-looping child has been read past its end."""
        return self._finished

    def read_frame(self) -> "Optional[np.ndarray]":
        """Return the next frame as a float32 ndarray, or ``None`` when done.

        Samples are decoded as int16 and widened to float32, then scaled by
        the current gain (times the fade-in ramp while it is in progress).
        A looping child with data rewinds at the end instead of finishing.
        """
        if self._finished:
            return None

        if self._pos >= len(self._pcm):
            if not (self.loop and self._pcm):
                self._finished = True
                return None
            self._pos = 0

        np = _require_numpy()
        start = self._pos
        stop = start + FRAME_SIZE
        self._pos = stop
        raw = self._pcm[start:stop]
        shortfall = FRAME_SIZE - len(raw)
        if shortfall > 0:
            raw = raw + bytes(shortfall)

        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)

        scale = self.gain
        if self.fade_frames and self._fade_done < self.fade_frames:
            self._fade_done += 1
            scale *= self._fade_done / self.fade_frames

        return samples if scale == 1.0 else samples * scale
|
||||
|
||||
|
||||
class VoiceMixer:
    """A continuous audio source that mixes N child streams into one.

    Use :meth:`set_ambient` to install/replace the looping idle bed and
    :meth:`play_speech` to layer a one-shot clip over it (ducking the ambient
    while it plays). All shared state is guarded by one ``threading.Lock`` so
    those calls can come from a different thread than :meth:`read`.

    Once :meth:`cleanup` has run the mixer is closed: :meth:`read` returns
    silence and :meth:`set_ambient` / :meth:`play_speech` become no-ops, so
    :attr:`speech_active` can never get stuck ``True`` on a dead mixer.

    NOTE(review): this class provides the ``read`` / ``is_opus`` / ``cleanup``
    methods of ``discord.AudioSource`` but does not subclass it — confirm that
    the consumer accepts a duck-typed source.
    """

    def __init__(
        self,
        *,
        ambient_gain: float = 0.18,
        duck_gain: float = 0.06,
        speech_gain: float = 1.0,
        duck_release_ms: int = 400,
    ):
        self._lock = threading.Lock()
        self._ambient: "Optional[MixerChild]" = None
        self._speech: "List[MixerChild]" = []
        self._ambient_gain = float(ambient_gain)
        self._duck_gain = float(duck_gain)
        self._speech_gain = float(speech_gain)
        # When speech ends, ramp the ambient back up over this many frames
        # instead of jumping, so the bed swells back smoothly.
        self._duck_release_frames = max(1, duck_release_ms // FRAME_LENGTH_MS)
        self._duck_release_left = 0
        self._closed = False
        # True from play_speech() until the last speech child drains (or
        # stop_speech() is called).
        self._speech_active = False

    def is_opus(self) -> bool:  # pragma: no cover - trivial
        """Report that :meth:`read` yields raw PCM, not opus packets."""
        return False

    # ------------------------------------------------------------------
    # Ambient (idle / "thinking") bed
    # ------------------------------------------------------------------

    def set_ambient(self, pcm: Optional[bytes], *, gain: Optional[float] = None) -> None:
        """Install (or clear, with a falsy ``pcm``) the looping ambient bed.

        Args:
            pcm: PCM bytes for the loop, or ``None``/empty to remove the bed.
            gain: New idle (un-ducked) ambient gain; unchanged when ``None``.
        """
        with self._lock:
            if gain is not None:
                self._ambient_gain = float(gain)
            if self._closed or not pcm:
                self._ambient = None
                return
            self._ambient = MixerChild(
                "ambient", pcm, loop=True,
                gain=self._effective_ambient_gain(), fade_in_ms=200,
            )

    def _effective_ambient_gain(self) -> float:
        """Return the ducked gain while speech is active, else the idle gain."""
        return self._duck_gain if self._speech_active else self._ambient_gain

    # ------------------------------------------------------------------
    # Speech (TTS replies, verbal acks) layered over the ambient bed
    # ------------------------------------------------------------------

    def play_speech(self, pcm: bytes, *, gain: Optional[float] = None,
                    fade_in_ms: int = 40) -> None:
        """Layer a one-shot speech clip over the ambient bed (ducks ambient).

        Does nothing when ``pcm`` is empty or the mixer has been closed: a
        closed mixer is never read again, so a queued clip would never drain
        and :attr:`speech_active` would stay ``True`` forever.

        Args:
            pcm: PCM bytes of the clip.
            gain: Per-clip gain; defaults to the mixer's ``speech_gain``.
            fade_in_ms: Linear fade-in length for the clip.
        """
        if not pcm:
            return
        with self._lock:
            if self._closed:
                return
            child = MixerChild(
                "speech", pcm, loop=False,
                gain=self._speech_gain if gain is None else float(gain),
                is_speech=True, fade_in_ms=fade_in_ms,
            )
            self._speech.append(child)
            self._speech_active = True
            # Cancel any release ramp in progress and duck immediately.
            self._duck_release_left = 0
            if self._ambient is not None:
                self._ambient.gain = self._duck_gain

    @property
    def speech_active(self) -> bool:
        """True while at least one speech clip is queued or playing."""
        with self._lock:
            return self._speech_active

    def stop_speech(self) -> None:
        """Drop any in-flight speech immediately and release the duck.

        A no-op when nothing is playing, so an idle mixer's ambient bed is not
        dipped to the duck level and ramped back up for no reason.
        """
        with self._lock:
            was_active = self._speech_active or bool(self._speech)
            self._speech.clear()
            if was_active:
                self._begin_duck_release_locked()

    def _begin_duck_release_locked(self) -> None:
        """Mark speech as ended and arm the ambient release ramp (lock held)."""
        self._speech_active = False
        self._duck_release_left = self._duck_release_frames

    # ------------------------------------------------------------------
    # AudioSource interface
    # ------------------------------------------------------------------

    def read(self) -> bytes:
        """Return one mixed PCM frame.

        Never returns ``b""``: when there is nothing to mix, or the mixer is
        closed, a frame of silence is returned instead so the consumer keeps
        polling.
        """
        with self._lock:
            if self._closed:
                return SILENCE_FRAME

            np = _require_numpy()
            acc: "Optional[np.ndarray]" = None

            # Speech children (drop exhausted ones; release duck when last ends)
            if self._speech:
                still_live: "List[MixerChild]" = []
                for child in self._speech:
                    frame = child.read_frame()
                    if frame is None:
                        continue
                    acc = frame if acc is None else acc + frame
                    still_live.append(child)
                self._speech = still_live
                if not still_live and self._speech_active:
                    self._begin_duck_release_locked()

            # Ambient bed — ramp gain back up during duck-release.
            ambient = self._ambient
            if ambient is not None:
                if not self._speech_active:
                    if self._duck_release_left > 0:
                        self._duck_release_left -= 1
                        frac = 1.0 - (self._duck_release_left / self._duck_release_frames)
                        ambient.gain = (
                            self._duck_gain
                            + (self._ambient_gain - self._duck_gain) * frac
                        )
                    else:
                        ambient.gain = self._ambient_gain
                bed = ambient.read_frame()
                if bed is not None:
                    acc = bed if acc is None else acc + bed

            if acc is None:
                return SILENCE_FRAME

            # Sum was accumulated in float; clamp to the int16 range before
            # narrowing so overlapping loud clips saturate instead of wrapping.
            np.clip(acc, -32768, 32767, out=acc)
            return acc.astype(np.int16).tobytes()

    def cleanup(self) -> None:
        """Close the mixer and drop all children (called when playback stops)."""
        with self._lock:
            self._closed = True
            self._ambient = None
            self._speech.clear()
            self._speech_active = False
            self._duck_release_left = 0
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# PCM helpers
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
def decode_to_pcm(path: str, *, timeout: float = 30.0) -> Optional[bytes]:
    """Decode an audio file to raw s16le PCM via the ``ffmpeg`` executable.

    The output sample rate and channel count come from ``SAMPLE_RATE`` and
    ``CHANNELS``. ffmpeg is detached from the parent's stdin (``-nostdin`` and
    ``stdin=DEVNULL``) so it can neither consume input meant for the host
    process nor wait on it.

    Args:
        path: Audio file to decode.
        timeout: Seconds to wait for ffmpeg before giving up.

    Returns:
        The PCM bytes, or ``None`` when ffmpeg is missing, times out, exits
        non-zero, or produces no output. Failures are logged as warnings.
    """
    import subprocess

    command = [
        "ffmpeg", "-nostdin", "-y", "-loglevel", "error",
        "-i", path,
        "-f", "s16le",
        "-ar", str(SAMPLE_RATE),
        "-ac", str(CHANNELS),
        "pipe:1",
    ]
    try:
        proc = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            timeout=timeout,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        logger.warning("decode_to_pcm failed for %s: %s", path, e)
        return None
    if proc.returncode != 0:
        logger.warning(
            "ffmpeg decode failed for %s (rc=%d): %s",
            path, proc.returncode, (proc.stderr or b"").decode("utf-8", "replace")[:200],
        )
        return None
    # Empty output is treated as a failure rather than a zero-length clip.
    return proc.stdout or None
|
||||
|
||||
|
||||
def synth_ambient_pcm(seconds: float = 4.0) -> bytes:
    """Synthesise a subtle looping ambient bed (no asset file required).

    A soft, slowly-pulsing low pad: two sine partials with a gentle tremolo,
    plus a touch of smoothed noise. Every component is periodic over the clip
    length — the sines and tremolo complete whole cycles and the noise is
    smoothed circularly — so the wrap point of the loop is continuous. Mono
    content is duplicated across ``CHANNELS``.

    Args:
        seconds: Clip length; must cover at least one sample.

    Returns:
        Interleaved int16 PCM bytes, peak-normalised to half of full scale
        (the mixer applies the real ambient gain on top).

    Raises:
        ValueError: ``seconds`` is too small to produce any samples.
    """
    np = _require_numpy()
    n = int(SAMPLE_RATE * seconds) if seconds > 0 else 0
    if n <= 0:
        raise ValueError(f"seconds must yield at least one sample, got {seconds!r}")
    t = np.arange(n, dtype=np.float64) / SAMPLE_RATE

    # Choose base frequencies that complete whole cycles over the loop so the
    # wrap point is click-free.
    def _whole_cycle_freq(target: float) -> float:
        cycles = max(1, round(target * seconds))
        return cycles / seconds

    f1 = _whole_cycle_freq(110.0)
    f2 = _whole_cycle_freq(110.5)
    trem = _whole_cycle_freq(0.5)  # ~0.5 Hz tremolo

    pad = (
        0.55 * np.sin(2 * np.pi * f1 * t)
        + 0.45 * np.sin(2 * np.pi * f2 * t)
    )
    tremolo = 0.6 + 0.4 * (0.5 * (1 + np.sin(2 * np.pi * trem * t)))
    signal = pad * tremolo

    # Smooth noise for air, kept very low. Fixed seed => deterministic output.
    rng = np.random.default_rng(7)
    noise = rng.standard_normal(n)
    kernel = np.ones(64) / 64.0
    # Wrap-pad before the moving average so the smoothed noise is periodic
    # across the loop seam; zero-padded "same" filtering would instead blend
    # silence into both ends of the clip.
    margin = len(kernel)
    wrapped = np.pad(noise, margin, mode="wrap")
    noise = np.convolve(wrapped, kernel, mode="same")[margin:margin + n]
    signal = signal + 0.08 * noise

    # Normalise to a modest peak (mixer applies the real ambient gain on top).
    peak = float(np.max(np.abs(signal))) or 1.0
    signal = (signal / peak) * 0.5

    mono16 = (signal * 32767.0).astype(np.int16)
    stereo16 = np.repeat(mono16[:, None], CHANNELS, axis=1).reshape(-1)
    return stereo16.tobytes()
|
||||
264
tests/gateway/test_discord_voice_mixer.py
Normal file
264
tests/gateway/test_discord_voice_mixer.py
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
"""Tests for the Discord continuous voice mixer (ambient + ducked speech)
|
||||
and the verbal-ack-before-tool-calls hook.
|
||||
|
||||
The mixer (plugins/platforms/discord/voice_mixer.py) is pure-PCM and has no
|
||||
discord.py dependency, so its core is tested directly. The adapter
|
||||
integration (install on join, play routing, ack) is tested with the standard
|
||||
``object.__new__(DiscordAdapter)`` helper used elsewhere in the voice suite.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
# numpy ships only in the optional "voice" extra (not [all,dev]); the mixer
|
||||
# math needs it, so skip this whole module when it isn't installed.
|
||||
np = pytest.importorskip("numpy")
|
||||
|
||||
# voice_mixer lives inside the discord plugin package dir; import by path the
|
||||
# same way the adapter does.
|
||||
# The repository root sits two directories above this file's own directory.
_DISCORD_DIR = os.path.join(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
    ),
    "plugins", "platforms", "discord",
)
# Make the plugin directory importable exactly once.
if _DISCORD_DIR not in sys.path:
    sys.path.insert(0, _DISCORD_DIR)
|
||||
|
||||
import voice_mixer as vm # noqa: E402
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# Pure mixer unit tests
|
||||
# =====================================================================
|
||||
|
||||
class TestVoiceMixerCore:
    """Unit tests for the pure-PCM mixer core (no discord.py required)."""

    @staticmethod
    def _peak(frame):
        """Return the largest absolute int16 sample in a raw PCM frame."""
        return int(np.max(np.abs(np.frombuffer(frame, dtype=np.int16))))

    def test_frame_geometry_matches_discord(self):
        # 20ms @ 48kHz stereo s16 == 3840 bytes (discord.opus.Encoder.FRAME_SIZE)
        assert vm.FRAME_SIZE == 3840
        assert vm.SAMPLES_PER_FRAME == 960
        assert len(vm.SILENCE_FRAME) == vm.FRAME_SIZE

    def test_empty_mixer_returns_silence_frames(self):
        mx = vm.VoiceMixer()
        for _ in range(5):
            frame = mx.read()
            assert len(frame) == vm.FRAME_SIZE
            assert frame == vm.SILENCE_FRAME

    def test_is_opus_false(self):
        # discord.py sends raw PCM when is_opus() is False.
        assert vm.VoiceMixer().is_opus() is False

    def test_ambient_loops_and_is_quiet(self):
        mx = vm.VoiceMixer(ambient_gain=0.2)
        amb = vm.synth_ambient_pcm(seconds=0.5)
        assert len(amb) % vm.FRAME_SIZE == 0  # frame-aligned for seamless loop
        mx.set_ambient(amb)
        peaks = [self._peak(mx.read()) for _ in range(100)]  # 2s >> 0.5s loop
        # Produces audio after the fade-in and stays well below full scale.
        assert any(p > 0 for p in peaks[10:])
        assert max(peaks) < int(32767 * 0.5)

    def test_speech_audible_over_ambient_then_releases(self):
        mx = vm.VoiceMixer(ambient_gain=0.2, duck_gain=0.05, duck_release_ms=200)
        mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5))
        base = max(self._peak(mx.read()) for _ in range(10))
        # 0.4 s of a 440 Hz tone, duplicated to stereo.
        tone = (np.sin(2 * np.pi * 440 * np.arange(int(48000 * 0.4)) / 48000)
                * 20000).astype(np.int16)
        stereo = np.repeat(tone[:, None], 2, axis=1).reshape(-1).tobytes()
        mx.play_speech(stereo, fade_in_ms=0)
        assert mx.speech_active
        speech_peak = max(self._peak(mx.read()) for _ in range(15))
        assert speech_peak > base
        # Drain past speech + release ramp; speech_active clears.
        for _ in range(40):
            mx.read()
        assert not mx.speech_active

    def test_clipping_prevents_int16_wraparound(self):
        mx = vm.VoiceMixer()
        loud = (np.ones(vm.SAMPLES_PER_FRAME * 2) * 30000).astype(np.int16).tobytes()
        mx.play_speech(loud, fade_in_ms=0)
        mx.play_speech(loud, fade_in_ms=0)
        out = np.frombuffer(mx.read(), dtype=np.int16)
        assert int(out.max()) == 32767  # clamped at full scale
        # Both inputs are strictly positive, so a negative sample can only
        # come from the sum wrapping around instead of being clipped. (A
        # ">= -32768" check would be vacuous for int16 data.)
        assert int(out.min()) >= 0

    def test_stop_speech_clears_in_flight(self):
        mx = vm.VoiceMixer()
        tone = (np.ones(48000) * 10000).astype(np.int16)
        stereo = np.repeat(tone[:, None], 2, axis=1).reshape(-1).tobytes()
        mx.play_speech(stereo)
        assert mx.speech_active
        mx.stop_speech()
        mx.read()
        assert not mx.speech_active

    def test_set_ambient_none_clears(self):
        mx = vm.VoiceMixer()
        mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5))
        mx.set_ambient(None)
        # No ambient, no speech -> silence.
        assert mx.read() == vm.SILENCE_FRAME

    def test_cleanup_silences(self):
        mx = vm.VoiceMixer()
        mx.set_ambient(vm.synth_ambient_pcm(seconds=0.5))
        mx.cleanup()
        assert mx.read() == vm.SILENCE_FRAME

    def test_pcm_not_frame_aligned_is_padded(self):
        # Odd-length PCM must be padded to whole frames (no IndexError, no click).
        mx = vm.VoiceMixer()
        mx.play_speech(b"\x01\x02\x03", fade_in_ms=0)  # 3 bytes << one frame
        out = mx.read()
        assert len(out) == vm.FRAME_SIZE

    def test_synth_ambient_is_stereo_and_frame_aligned(self):
        pcm = vm.synth_ambient_pcm(seconds=1.0)
        assert len(pcm) % (vm.CHANNELS * vm.SAMPLE_WIDTH) == 0
        assert len(pcm) % vm.FRAME_SIZE == 0
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# Adapter integration
|
||||
# =====================================================================
|
||||
|
||||
def _make_adapter(fx_cfg=None):
    """Build a ``DiscordAdapter`` without running ``__init__``.

    Only the attributes the voice tests touch are populated. ``fx_cfg``
    replaces the default voice-fx settings when it is not ``None``.
    """
    from plugins.platforms.discord.adapter import DiscordAdapter
    from gateway.config import Platform, PlatformConfig

    if fx_cfg is None:
        fx_cfg = {
            "enabled": True, "ambient_enabled": True, "ambient_path": "",
            "ambient_gain": 0.18, "duck_gain": 0.06, "speech_gain": 1.0,
            "ack_enabled": True, "ack_phrases": ["One moment."],
        }

    platform_cfg = PlatformConfig(enabled=True, extra={})
    platform_cfg.token = "fake-token"

    adapter = object.__new__(DiscordAdapter)
    adapter.platform = Platform.DISCORD
    adapter.config = platform_cfg
    adapter._client = MagicMock()

    # Each per-id registry starts out as its own empty dict.
    for registry in (
        "_voice_clients",
        "_voice_locks",
        "_voice_text_channels",
        "_voice_sources",
        "_voice_timeout_tasks",
        "_voice_receivers",
        "_voice_listen_tasks",
        "_voice_mixers",
    ):
        setattr(adapter, registry, {})

    adapter._ambient_pcm_cache = None
    adapter._voice_fx_cfg = fx_cfg
    return adapter
|
||||
|
||||
|
||||
class TestVoiceMixerActive:
    """``voice_mixer_active`` reflects whether a mixer is registered for an id."""

    def test_false_when_no_mixer(self):
        assert _make_adapter().voice_mixer_active(111) is False

    def test_true_when_mixer_present(self):
        adapter = _make_adapter()
        adapter._voice_mixers[111] = object()
        active = adapter.voice_mixer_active(111)
        assert active is True

    def test_false_when_attr_missing(self):
        # Defensive getattr path (object.__new__ helper that forgot the attr).
        from plugins.platforms.discord.adapter import DiscordAdapter
        from gateway.config import Platform

        stripped = object.__new__(DiscordAdapter)
        stripped.platform = Platform.DISCORD
        assert stripped.voice_mixer_active(111) is False
|
||||
|
||||
|
||||
class TestPlayInVoiceChannelMixerPath:
    """``play_in_voice_channel`` routing: mixer first, legacy playback as fallback."""

    @pytest.mark.asyncio
    async def test_routes_through_mixer_when_present(self):
        class _OneShotMixer:
            """Reports speech as active on the first poll only.

            The first ``True`` lets play_speech be observed; every later
            ``False`` lets the wait loop exit promptly.
            """

            def __init__(self):
                self._polled = False
                self.play_speech = MagicMock()

            @property
            def speech_active(self):
                first_poll = not self._polled
                self._polled = True
                return first_poll

        voice_client = MagicMock()
        voice_client.is_connected.return_value = True

        adapter = _make_adapter()
        adapter._voice_clients[111] = voice_client
        mixer = _OneShotMixer()
        adapter._voice_mixers[111] = mixer
        adapter._reset_voice_timeout = MagicMock()

        one_silent_frame = b"\x00" * vm.FRAME_SIZE
        with patch.object(vm, "decode_to_pcm", return_value=one_silent_frame):
            ok = await adapter.play_in_voice_channel(111, "/tmp/x.mp3")

        assert ok is True
        mixer.play_speech.assert_called_once()
        # Legacy path must NOT have been used.
        voice_client.play.assert_not_called()

    @pytest.mark.asyncio
    async def test_falls_back_when_decode_fails(self):
        async def _fast(coro, *args, **kwargs):
            # Resolve the legacy wait loop immediately without leaving the
            # real Event.wait() coroutine unawaited.
            if hasattr(coro, "close"):
                coro.close()
            return None

        voice_client = MagicMock()
        voice_client.is_connected.return_value = True
        voice_client.is_playing.return_value = False

        adapter = _make_adapter()
        adapter._voice_clients[111] = voice_client
        adapter._voice_mixers[111] = MagicMock()
        adapter._reset_voice_timeout = MagicMock()
        adapter._voice_receivers[111] = MagicMock()

        with patch.object(vm, "decode_to_pcm", return_value=None), \
                patch("plugins.platforms.discord.adapter.discord") as mock_discord, \
                patch("asyncio.wait_for", _fast):
            mock_discord.FFmpegPCMAudio.return_value = MagicMock()
            mock_discord.PCMVolumeTransformer.return_value = MagicMock()
            await adapter.play_in_voice_channel(111, "/tmp/x.mp3")

        # Fell through to legacy path -> vc.play called.
        assert voice_client.play.called
|
||||
|
||||
|
||||
class TestPlayAckInVoice:
    """``play_ack_in_voice`` only speaks when acks are enabled and a mixer exists."""

    @pytest.mark.asyncio
    async def test_noop_when_ack_disabled(self):
        adapter = _make_adapter({"ack_enabled": False})
        adapter._voice_mixers[111] = MagicMock()
        played = await adapter.play_ack_in_voice(111)
        assert played is False

    @pytest.mark.asyncio
    async def test_noop_when_no_mixer(self):
        played = await _make_adapter().play_ack_in_voice(111)
        assert played is False

    @pytest.mark.asyncio
    async def test_plays_speech_when_armed(self, tmp_path):
        import json

        # A stand-in "TTS output" file; its bytes are never really decoded
        # because decode_to_pcm is patched below.
        ack_file = tmp_path / "ack.mp3"
        ack_file.write_bytes(b"id3")
        tts_result = json.dumps({"success": True, "file_path": str(ack_file)})

        adapter = _make_adapter()
        mixer = MagicMock()
        adapter._voice_mixers[111] = mixer
        adapter._reset_voice_timeout = MagicMock()

        with patch("tools.tts_tool.text_to_speech_tool", return_value=tts_result), \
                patch.object(vm, "decode_to_pcm", return_value=b"\x00" * vm.FRAME_SIZE):
            ok = await adapter.play_ack_in_voice(111, phrase="Testing one two.")

        assert ok is True
        mixer.play_speech.assert_called_once()
|
||||
|
|
@ -683,6 +683,37 @@ For the full setup and operational guide, see:
|
|||
- [Voice Mode](/user-guide/features/voice-mode)
|
||||
- [Use Voice Mode with Hermes](/guides/use-voice-mode-with-hermes)
|
||||
|
||||
### Voice Channel Audio Effects (ambient + verbal acks)
|
||||
|
||||
When the bot is in a voice channel, you can give it a more conversational feel: a short verbal acknowledgement ("let me look into that") before it starts working, and a subtle ambient "thinking" bed that plays underneath while tools run. Speech ducks the ambient bed down, and the bed swells back once the bot finishes speaking — similar to Grok voice mode.
|
||||
|
||||
discord.py plays only one audio stream per connection, so Hermes installs a software mixer on the outgoing stream that sums an ambient loop, acknowledgements, and TTS replies into that single stream — they overlap instead of cutting each other off.
|
||||
|
||||
This is **off by default**. Enable it in `config.yaml`:
|
||||
|
||||
```yaml
|
||||
discord:
|
||||
voice_fx:
|
||||
enabled: true # master switch
|
||||
ambient_enabled: true # idle "thinking" bed while tools run
|
||||
ambient_path: "" # custom loop file (any audio format); "" = built-in synthesised pad
|
||||
ambient_gain: 0.18 # idle bed loudness (0.0–1.0)
|
||||
duck_gain: 0.06 # ambient loudness while the bot is speaking
|
||||
speech_gain: 1.0 # TTS / acknowledgement loudness
|
||||
ack_enabled: true # speak a short phrase before the first tool call of a turn
|
||||
ack_phrases: # picked at random; set to [] to disable the spoken ack
|
||||
- "Let me look into that."
|
||||
- "One moment."
|
||||
- "Checking on that now."
|
||||
```
|
||||
|
||||
Notes:
|
||||
- The acknowledgement fires at most once per turn, only when the bot is in a voice channel and the mixer is active. It uses your configured TTS provider.
|
||||
- `ambient_path` accepts any file `ffmpeg` can decode; it's looped seamlessly. Leave it empty to use the built-in synthesised pad (no asset needed).
|
||||
- All settings live in `config.yaml` (not `.env`) — they're behavioral, not secrets.
|
||||
- When `voice_fx.enabled` is `false`, voice playback uses the original one-shot path and nothing changes.
|
||||
|
||||
|
||||
## Forum Channels
|
||||
|
||||
Discord forum channels (type 15) don't accept direct messages — every post in a forum must be a thread. Hermes auto-detects forum channels and creates a new thread post whenever it needs to send there, so `send_message`, TTS, images, voice messages, and file attachments all work without special handling from the agent.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue