fix(gateway): suppress duplicate voice transcripts

Deduplicate exact and near-exact Discord voice STT transcripts per guild/user over a short (12-second) window to avoid duplicate delayed agent replies. Add regression tests for exact and near-duplicate voice transcript suppression.
2026-05-07 02:51:50 +00:00 · 2026-05-03 09:24:08 -06:00 · 2026-05-03 09:24:08 -06:00 · 1bd975c0ba
commit 1bd975c0ba
parent b58db237e4
2 changed files with 94 additions and 0 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1161,6 +1161,10 @@ class GatewayRunner:
        # Per-chat voice reply mode: "off" | "voice_only" | "all"
        self._voice_mode: Dict[str, str] = self._load_voice_modes()
        # Recent voice transcripts per (guild,user) for duplicate suppression.
        # Protects against the same utterance being emitted twice by the voice
        # capture / STT pipeline, which otherwise produces a second delayed reply.
        self._recent_voice_transcripts: Dict[tuple[int, int], List[tuple[float, str]]] = {}
        # Track background tasks to prevent garbage collection mid-execution
        self._background_tasks: set = set()
@ -8261,6 +8265,47 @@ class GatewayRunner:
        adapter = self.adapters.get(Platform.DISCORD)
        self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=True)
    def _is_duplicate_voice_transcript(self, guild_id: int, user_id: int, transcript: str) -> bool:
        """Suppress repeated STT outputs for the same recent utterance.
        Voice capture can occasionally emit the same utterance twice a few
        seconds apart, which creates a second queued agent run and overlapping
        spoken replies. Dedup exact and near-exact repeats per guild/user over a
        short window while allowing genuinely new turns through.
        """
        from difflib import SequenceMatcher
        normalized = re.sub(r"\s+", " ", transcript).strip().lower()
        normalized = re.sub(r"[^\w\s]", "", normalized)
        if not normalized:
            return False
        now = time.monotonic()
        window_seconds = 12.0
        key = (guild_id, user_id)
        recent_store = getattr(self, "_recent_voice_transcripts", None)
        if not isinstance(recent_store, dict):
            recent_store = {}
            self._recent_voice_transcripts = recent_store
        recent = [
            (ts, txt)
            for ts, txt in recent_store.get(key, [])
            if now - ts <= window_seconds
        ]
        for _, prior in recent:
            if prior == normalized:
                recent_store[key] = recent
                return True
            if len(prior) >= 16 and len(normalized) >= 16:
                if SequenceMatcher(None, prior, normalized).ratio() >= 0.95:
                    recent_store[key] = recent
                    return True
        recent.append((now, normalized))
        recent_store[key] = recent[-5:]
        return False
    async def _handle_voice_channel_input(
        self, guild_id: int, user_id: int, transcript: str
    ):
@ -8298,6 +8343,15 @@ class GatewayRunner:
            logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
            return
        if self._is_duplicate_voice_transcript(guild_id, user_id, transcript):
            logger.info(
                "Suppressing duplicate voice transcript for guild=%s user=%s: %s",
                guild_id,
                user_id,
                transcript[:100],
            )
            return
        # Show transcript in text channel (after auth, with mention sanitization)
        try:
            channel = adapter._client.get_channel(text_ch_id)
--- a/tests/gateway/test_voice_command.py
+++ b/tests/gateway/test_voice_command.py
@ -954,6 +954,46 @@ class TestVoiceChannelCommands:
        assert "Test transcript" in msg
        assert "42" in msg  # user_id in mention
    @pytest.mark.asyncio
    async def test_input_suppresses_duplicate_transcript(self, runner):
        """An identical transcript repeated back-to-back is dispatched only once."""
        from gateway.config import Platform

        text_channel = AsyncMock()
        discord_client = MagicMock()
        discord_client.get_channel = MagicMock(return_value=text_channel)

        adapter = AsyncMock()
        adapter._voice_text_channels = {111: 123}
        adapter._voice_sources = {}
        adapter._client = discord_client
        adapter.handle_message = AsyncMock()
        runner.adapters[Platform.DISCORD] = adapter

        # Feed the very same utterance twice for the same guild/user pair.
        for _ in range(2):
            await runner._handle_voice_channel_input(111, 42, "Hello from VC")

        # Only the first occurrence may reach the adapter and the text channel.
        adapter.handle_message.assert_called_once()
        text_channel.send.assert_called_once()
    @pytest.mark.asyncio
    async def test_input_suppresses_near_duplicate_transcript(self, runner):
        """Two transcripts that differ by a single word are dispatched only once."""
        from gateway.config import Platform

        guild_id, user_id = 111, 42
        first_hearing = "This is a test of the voice system"
        second_hearing = "This is a test for the voice system"

        text_channel = AsyncMock()
        adapter = AsyncMock()
        adapter._voice_text_channels = {111: 123}
        adapter._voice_sources = {}
        adapter._client = MagicMock()
        adapter._client.get_channel = MagicMock(return_value=text_channel)
        adapter.handle_message = AsyncMock()
        runner.adapters[Platform.DISCORD] = adapter

        await runner._handle_voice_channel_input(guild_id, user_id, first_hearing)
        await runner._handle_voice_channel_input(guild_id, user_id, second_hearing)

        # The slightly reworded second transcript must be swallowed.
        adapter.handle_message.assert_called_once()
        text_channel.send.assert_called_once()
    # -- _get_guild_id --
    def test_get_guild_id_from_guild(self, runner):