mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-13 09:01:54 +00:00
## What does this PR do?

The voice-during-active-run feature (#41984) changed `_enrich_message_with_transcription` so that it returns a `(enriched_text, successful_transcripts)` tuple instead of a bare string, which lets callers echo the raw transcript back to the user. The signature and every other return path were updated to match, but one branch was missed: when a successfully transcribed clip arrives with the Discord "empty content" placeholder as its caption, the method still returned the prefix string on its own. All four call sites unpack the result with `text, transcripts = await self._enrich_message_with_transcription(...)`, so that path raised `ValueError: too many values to unpack (expected 2)` and the inbound voice message was dropped instead of reaching the agent.

This is a real user-facing path rather than a corner case: a Discord voice note sent without a caption is delivered as exactly that placeholder, so a captionless voice message that transcribed correctly would crash the handler precisely when transcription had worked. The fix returns the proper tuple from that branch so the placeholder is still stripped while the transcripts continue to flow back to the caller for the echo.

## Related Issue

N/A

## Type of Change

- [x] 🐛 Bug fix (non-breaking change that fixes an issue)
- [ ] ✨ New feature (non-breaking change that adds functionality)
- [ ] 🔒 Security fix
- [ ] 📝 Documentation update
- [ ] ✅ Tests (adding or improving test coverage)
- [ ] ♻️ Refactor (no behavior change)
- [ ] 🎯 New skill (bundled or hub)

## Changes Made

- `gateway/run.py`: in `_enrich_message_with_transcription`, return `(prefix, successful_transcripts)` instead of a bare `prefix` from the empty-content-placeholder branch, so the contract matches the signature and the other return paths.
- `tests/gateway/test_stt_config.py`: add `test_enrich_message_with_transcription_returns_tuple_for_empty_content_placeholder`, which drives a successful transcription with the placeholder caption and asserts the placeholder is stripped while the transcript is still returned.

## How to Test

1. Check out `main` and run the new test — it fails with `ValueError: too many values to unpack (expected 2)`, reproducing the crash a captionless Discord voice note would trigger.
2. Apply this change and re-run `pytest tests/gateway/test_stt_config.py -q` — all tests pass.
3. `ruff check gateway/run.py tests/gateway/test_stt_config.py` and `python scripts/check-windows-footguns.py gateway/run.py tests/gateway/test_stt_config.py` both pass.

## Checklist

### Code

- [x] I've read the [Contributing Guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md)
- [x] My commit messages follow [Conventional Commits](https://www.conventionalcommits.org/) (`fix(scope):`, `feat(scope):`, etc.)
- [x] I searched for [existing PRs](https://github.com/NousResearch/hermes-agent/pulls) to make sure this isn't a duplicate
- [x] My PR contains **only** changes related to this fix/feature (no unrelated commits)
- [x] I've run `pytest tests/ -q` and all tests pass
- [x] I've added tests for my changes (required for bug fixes, strongly encouraged for features)
- [x] I've tested on my platform: macOS 15 (Darwin 25.5)

### Documentation & Housekeeping

- [x] I've updated relevant documentation (README, `docs/`, docstrings) — or N/A
- [x] I've updated `cli-config.yaml.example` if I added/changed config keys — or N/A
- [x] I've updated `CONTRIBUTING.md` or `AGENTS.md` if I changed architecture or workflows — or N/A
- [x] I've considered cross-platform impact (Windows, macOS) per the [compatibility guide](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#cross-platform-compatibility) — or N/A
- [x] I've updated tool descriptions/schemas if I changed tool behavior — or N/A
184 lines
6.1 KiB
Python
184 lines
6.1 KiB
Python
"""Gateway STT config tests — honor stt.enabled: false from config.yaml."""
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from gateway.config import GatewayConfig, Platform, load_gateway_config
|
|
from gateway.platforms.base import MessageEvent, MessageType
|
|
from gateway.session import SessionSource
|
|
|
|
|
|
def test_gateway_config_stt_disabled_from_dict_nested():
    """A nested ``{"stt": {"enabled": False}}`` mapping yields ``stt_enabled is False``."""
    raw_settings = {"stt": {"enabled": False}}

    parsed = GatewayConfig.from_dict(raw_settings)

    assert parsed.stt_enabled is False
|
|
|
|
|
|
def test_load_gateway_config_bridges_stt_enabled_from_config_yaml(tmp_path, monkeypatch):
    """``load_gateway_config`` reports STT disabled when ``config.yaml`` says so.

    A ``config.yaml`` containing ``stt.enabled: false`` is written under a
    temporary ``.hermes`` directory, and the loaded config must expose
    ``stt_enabled`` as ``False``.
    """
    home_dir = tmp_path / ".hermes"
    home_dir.mkdir()

    config_file = home_dir / "config.yaml"
    yaml_body = yaml.dump({"stt": {"enabled": False}})
    config_file.write_text(yaml_body, encoding="utf-8")

    # Redirect both the HERMES_HOME variable and Path.home() into the temp tree.
    monkeypatch.setenv("HERMES_HOME", str(home_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    loaded = load_gateway_config()

    assert loaded.stt_enabled is False
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_surfaces_path_when_stt_disabled():
    """With STT disabled, the enriched text exposes the audio path, not a transcript.

    The result must mention the voice message, the probed duration and the
    original caption, while the returned transcript list stays empty.
    """
    from gateway.run import GatewayRunner

    gateway = GatewayRunner.__new__(GatewayRunner)  # bypass __init__
    gateway.config = GatewayConfig(stt_enabled=False)
    # Should NOT be consulted in disabled branch.
    gateway._has_setup_skill = lambda: True

    audio_path = "/tmp/voice.ogg"
    forbid_transcription = patch(
        "tools.transcription_tools.transcribe_audio",
        side_effect=AssertionError("transcribe_audio should not be called when STT is disabled"),
    )
    fixed_duration = patch(
        "gateway.run._probe_audio_duration",
        new=AsyncMock(return_value="0:12"),
    )

    with forbid_transcription, fixed_duration:
        enriched, echoed = await gateway._enrich_message_with_transcription(
            "caption",
            [audio_path],
        )

    assert audio_path in enriched
    assert "voice message" in enriched.lower()
    assert "(duration: 0:12)" in enriched
    assert "caption" in enriched
    assert echoed == []
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_omits_duration_on_probe_failure():
    """When the duration probe returns ``None``, no duration appears in the text.

    STT is disabled here; the audio path must still be present and the
    transcript list must be empty.
    """
    from gateway.run import GatewayRunner

    gateway = GatewayRunner.__new__(GatewayRunner)  # bypass __init__
    gateway.config = GatewayConfig(stt_enabled=False)

    audio_path = "/tmp/voice.ogg"
    failed_probe = patch(
        "gateway.run._probe_audio_duration",
        new=AsyncMock(return_value=None),
    )

    with failed_probe:
        enriched, echoed = await gateway._enrich_message_with_transcription(
            "",
            [audio_path],
        )

    assert audio_path in enriched
    assert "duration" not in enriched.lower()
    assert echoed == []
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
    """A backend key error is reported as a transcription problem, not a missing provider.

    The mocked ``transcribe_audio`` fails with a "key not set" error; the
    enriched text must use the "trouble transcribing" wording, keep the
    caption, and return no transcripts.
    """
    from gateway.run import GatewayRunner

    gateway = GatewayRunner.__new__(GatewayRunner)  # bypass __init__
    gateway.config = GatewayConfig(stt_enabled=True)

    backend_failure = {"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"}
    failing_transcriber = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=backend_failure,
    )

    with failing_transcriber:
        enriched, echoed = await gateway._enrich_message_with_transcription(
            "caption",
            ["/tmp/voice.ogg"],
        )

    assert "No STT provider is configured" not in enriched
    assert "trouble transcribing" in enriched
    assert "caption" in enriched
    assert echoed == []
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_returns_tuple_for_empty_content_placeholder():
    """The empty-content placeholder path must still yield ``(text, transcripts)``.

    A captionless Discord voice note arrives with the literal
    ``"(The user sent a message with no text content)"`` placeholder as its
    caption. On a successful transcription that placeholder is dropped and
    only the transcript prefix remains, yet callers unpack the result as
    ``text, transcripts = ...`` and therefore need a 2-tuple. A bare string
    return raised ``ValueError: too many values to unpack`` and the voice
    message was lost.
    """
    from gateway.run import GatewayRunner

    gateway = GatewayRunner.__new__(GatewayRunner)  # bypass __init__
    gateway.config = GatewayConfig(stt_enabled=True)
    gateway._has_setup_skill = lambda: False

    placeholder = "(The user sent a message with no text content)"
    spoken_text = "hello from a captionless voice note"
    stt_success = {
        "success": True,
        "transcript": spoken_text,
        "provider": "local_command",
    }
    working_transcriber = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=stt_success,
    )

    with working_transcriber:
        enriched, echoed = await gateway._enrich_message_with_transcription(
            placeholder,
            ["/tmp/voice.ogg"],
        )

    # Only the transcript prefix survives; the redundant placeholder is gone.
    assert spoken_text in enriched
    assert placeholder not in enriched
    # The transcripts must still come back so callers can echo them.
    assert echoed == [spoken_text]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
    """``_prepare_inbound_message_text`` folds a voice event's transcript into its text.

    A voice ``MessageEvent`` with empty text and one audio attachment is
    passed in with a mocked successful transcription; the prepared text must
    contain the transcript and mention the voice message.
    """
    from gateway.run import GatewayRunner

    gateway = GatewayRunner.__new__(GatewayRunner)  # bypass __init__
    gateway.config = GatewayConfig(stt_enabled=True)
    gateway.adapters = {}
    gateway._model = "test-model"
    gateway._base_url = ""
    gateway._has_setup_skill = lambda: False

    origin = SessionSource(platform=Platform.TELEGRAM, chat_id="123", chat_type="dm")
    voice_event = MessageEvent(
        text="",
        message_type=MessageType.VOICE,
        source=origin,
        media_urls=["/tmp/queued-voice.ogg"],
        media_types=["audio/ogg"],
    )

    stt_success = {
        "success": True,
        "transcript": "queued voice transcript",
        "provider": "local_command",
    }
    working_transcriber = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=stt_success,
    )

    with working_transcriber:
        prepared = await gateway._prepare_inbound_message_text(
            event=voice_event,
            source=origin,
            history=[],
        )

    assert prepared is not None
    assert "queued voice transcript" in prepared
    assert "voice message" in prepared.lower()
|