hermes-agent/tests/gateway/test_stt_config.py
Kristian Vastveit d55304c39f fix(gateway): transcribe voice messages during active agent runs
Salvaged from #6600 (@kristianvast) — re-scoped to the voice half only and
rebased onto current main. The cascading-interrupt hang half of the original
PR landed independently in dd0d1222a, so this carries ONLY Problem 1.

When a voice/audio message arrives while the agent is busy on the same
session, it hit the interrupt path with empty text because STT only ran after
the running-agent guard — the voice was effectively lost. Now we transcribe
audio BEFORE signaling the agent (and on the fresh-message path), echo the raw
transcript back to the user (🎙️), and _enrich_message_with_transcription
returns (text, transcripts) so callers can echo. A new
_dequeue_pending_with_transcription drives the post-agent drain the same way.

Reapplied onto _prepare_inbound_message_text (inbound enrichment was extracted
from the inline dispatch block since the original PR).

Co-authored-by: Kristian Vastveit <kristian@agrointel.no>
2026-06-08 15:16:20 +05:30

145 lines
4.4 KiB
Python

"""Gateway STT config tests — honor stt.enabled: false from config.yaml."""
from pathlib import Path
from unittest.mock import AsyncMock, patch
import pytest
import yaml
from gateway.config import GatewayConfig, Platform, load_gateway_config
from gateway.platforms.base import MessageEvent, MessageType
from gateway.session import SessionSource
def test_gateway_config_stt_disabled_from_dict_nested():
config = GatewayConfig.from_dict({"stt": {"enabled": False}})
assert config.stt_enabled is False
def test_load_gateway_config_bridges_stt_enabled_from_config_yaml(tmp_path, monkeypatch):
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
(hermes_home / "config.yaml").write_text(
yaml.dump({"stt": {"enabled": False}}),
encoding="utf-8",
)
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
monkeypatch.setattr(Path, "home", lambda: tmp_path)
config = load_gateway_config()
assert config.stt_enabled is False
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_surfaces_path_when_stt_disabled():
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=False)
runner._has_setup_skill = lambda: True # Should NOT be consulted in disabled branch.
with patch(
"tools.transcription_tools.transcribe_audio",
side_effect=AssertionError("transcribe_audio should not be called when STT is disabled"),
), patch(
"gateway.run._probe_audio_duration",
new=AsyncMock(return_value="0:12"),
):
result, transcripts = await runner._enrich_message_with_transcription(
"caption",
["/tmp/voice.ogg"],
)
assert "/tmp/voice.ogg" in result
assert "voice message" in result.lower()
assert "(duration: 0:12)" in result
assert "caption" in result
assert transcripts == []
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_omits_duration_on_probe_failure():
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=False)
with patch(
"gateway.run._probe_audio_duration",
new=AsyncMock(return_value=None),
):
result, transcripts = await runner._enrich_message_with_transcription(
"",
["/tmp/voice.ogg"],
)
assert "/tmp/voice.ogg" in result
assert "duration" not in result.lower()
assert transcripts == []
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=True)
with patch(
"tools.transcription_tools.transcribe_audio",
return_value={"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"},
):
result, transcripts = await runner._enrich_message_with_transcription(
"caption",
["/tmp/voice.ogg"],
)
assert "No STT provider is configured" not in result
assert "trouble transcribing" in result
assert "caption" in result
assert transcripts == []
@pytest.mark.asyncio
async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=True)
runner.adapters = {}
runner._model = "test-model"
runner._base_url = ""
runner._has_setup_skill = lambda: False
source = SessionSource(
platform=Platform.TELEGRAM,
chat_id="123",
chat_type="dm",
)
event = MessageEvent(
text="",
message_type=MessageType.VOICE,
source=source,
media_urls=["/tmp/queued-voice.ogg"],
media_types=["audio/ogg"],
)
with patch(
"tools.transcription_tools.transcribe_audio",
return_value={
"success": True,
"transcript": "queued voice transcript",
"provider": "local_command",
},
):
result = await runner._prepare_inbound_message_text(
event=event,
source=source,
history=[],
)
assert result is not None
assert "queued voice transcript" in result
assert "voice message" in result.lower()