mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
Two coordinated changes that unblock downstream audio pipelines (diarization, custom transcription, archival) on attachments larger than the public Bot API's 20MB getFile ceiling.

- `stt.enabled: false` no longer drops voice/audio with a generic "transcription disabled" note. The gateway probes the cached file's duration (wave → mutagen → ffprobe ladder) and surfaces `[The user sent a voice message: <abs path> (duration: M:SS)]` to the agent so a skill or tool can pick up the raw file. The previous placeholder is replaced rather than appended when present.
- `platforms.telegram.extra.base_url` set → adapter auto-lifts its document size cap from 20MB to 2GB (the local telegram-bot-api `--local` ceiling) and the "too large" reply reports the active limit dynamically. No new config knob; presence of `base_url` is the opt-in.
- `platforms.telegram.extra.local_mode: true` wires `Application.builder().local_mode(True)` on the python-telegram-bot builder. PTB then reads files from disk instead of HTTP, which is required when telegram-bot-api runs in `--local` mode (the server returns absolute filesystem paths, not `/file/bot...` URLs).
- gateway/run.py: rewrites the `stt.enabled: false` branch of `_enrich_message_with_transcription`. New `_format_duration` + `_probe_audio_duration` helpers.
- gateway/platforms/telegram.py: `_max_doc_bytes` instance attribute derived from `extra.base_url`; `local_mode` builder wiring; dynamic "too large" message.
- tests/gateway/test_stt_config.py: covers path-surfacing with and without an existing user message, and placeholder replacement.
- tests/gateway/test_telegram_max_doc_bytes.py: 3 cases — default 20MB without base_url, 2GB when set, empty-string base_url keeps default.
- website/docs/user-guide/messaging/telegram.md: new "Skipping STT" subsection under Voice Messages and a full "Large Files (>20MB) via Local Bot API Server" walkthrough (api_id/api_hash, docker-compose, one-time `logOut` migration, `platforms.telegram.extra` config, the `local_mode` disk-access requirement, the silent HTTP-fallback 404).
- website/docs/user-guide/features/voice-mode.md: documents the `stt.enabled` knob in the config reference.
- `pytest tests/gateway/test_telegram_max_doc_bytes.py tests/gateway/test_stt_config.py` → 9/9 passing.
- Verified end-to-end on a live deployment: gateway log shows `Using custom Telegram base_url: http://...` and `Using Telegram local_mode (read files from disk)` on startup; voice messages above 20MB cache to disk and surface their path to the agent.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
142 lines
4.3 KiB
Python
142 lines
4.3 KiB
Python
"""Gateway STT config tests — honor stt.enabled: false from config.yaml."""
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from gateway.config import GatewayConfig, Platform, load_gateway_config
|
|
from gateway.platforms.base import MessageEvent, MessageType
|
|
from gateway.session import SessionSource
|
|
|
|
|
|
def test_gateway_config_stt_disabled_from_dict_nested():
    """A nested ``{"stt": {"enabled": False}}`` mapping yields ``stt_enabled is False``."""
    raw_settings = {"stt": {"enabled": False}}
    parsed = GatewayConfig.from_dict(raw_settings)
    assert parsed.stt_enabled is False
|
|
|
|
|
|
def test_load_gateway_config_bridges_stt_enabled_from_config_yaml(tmp_path, monkeypatch):
    """``load_gateway_config`` reports STT disabled when config.yaml says so.

    A throwaway ``.hermes`` directory holding a ``config.yaml`` with
    ``stt.enabled: false`` is created under ``tmp_path``; the loaded config
    must then expose ``stt_enabled is False``.
    """
    home_dir = tmp_path / ".hermes"
    home_dir.mkdir()

    yaml_text = yaml.dump({"stt": {"enabled": False}})
    config_file = home_dir / "config.yaml"
    config_file.write_text(yaml_text, encoding="utf-8")

    # Redirect both the HERMES_HOME variable and Path.home() into the temp
    # tree (presumably the loader resolves its config location from one of
    # these — confirm against gateway.config).
    monkeypatch.setenv("HERMES_HOME", str(home_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    loaded = load_gateway_config()

    assert loaded.stt_enabled is False
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_surfaces_path_when_stt_disabled():
    """With STT disabled, the enriched text carries the file path and duration.

    ``transcribe_audio`` is patched to raise if it is ever called, and the
    duration probe is stubbed to return ``"0:12"``. The result must mention
    the audio path, the phrase "voice message", the formatted duration, and
    the original caption.
    """
    from gateway.run import GatewayRunner

    # Build the runner without running __init__; only the attributes set
    # below are provided to the method under test.
    gateway = GatewayRunner.__new__(GatewayRunner)
    gateway.config = GatewayConfig(stt_enabled=False)
    gateway._has_setup_skill = lambda: True  # Should NOT be consulted in disabled branch.

    forbid_transcription = patch(
        "tools.transcription_tools.transcribe_audio",
        side_effect=AssertionError("transcribe_audio should not be called when STT is disabled"),
    )
    stub_duration_probe = patch(
        "gateway.run._probe_audio_duration",
        new=AsyncMock(return_value="0:12"),
    )

    with forbid_transcription, stub_duration_probe:
        enriched = await gateway._enrich_message_with_transcription(
            "caption",
            ["/tmp/voice.ogg"],
        )

    assert "/tmp/voice.ogg" in enriched
    assert "voice message" in enriched.lower()
    assert "(duration: 0:12)" in enriched
    assert "caption" in enriched
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_omits_duration_on_probe_failure():
    """When the duration probe returns ``None``, no duration text is emitted.

    The audio path must still appear in the result, but the word "duration"
    must not (case-insensitively).
    """
    from gateway.run import GatewayRunner

    # Skip __init__; the test supplies only the config attribute.
    gateway = GatewayRunner.__new__(GatewayRunner)
    gateway.config = GatewayConfig(stt_enabled=False)

    failing_probe = patch(
        "gateway.run._probe_audio_duration",
        new=AsyncMock(return_value=None),
    )

    with failing_probe:
        enriched = await gateway._enrich_message_with_transcription(
            "",
            ["/tmp/voice.ogg"],
        )

    assert "/tmp/voice.ogg" in enriched
    assert "duration" not in enriched.lower()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
    """A backend key error is reported as a transcription problem, not a missing provider.

    ``transcribe_audio`` is stubbed to fail with
    ``"VOICE_TOOLS_OPENAI_KEY not set"``. The enriched text must contain
    "trouble transcribing" and the caption, and must not contain
    "No STT provider is configured".
    """
    from gateway.run import GatewayRunner

    # Skip __init__; the test supplies only the config attribute.
    gateway = GatewayRunner.__new__(GatewayRunner)
    gateway.config = GatewayConfig(stt_enabled=True)

    backend_failure = {"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"}
    failing_transcribe = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=backend_failure,
    )

    with failing_transcribe:
        enriched = await gateway._enrich_message_with_transcription(
            "caption",
            ["/tmp/voice.ogg"],
        )

    assert "No STT provider is configured" not in enriched
    assert "trouble transcribing" in enriched
    assert "caption" in enriched
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
    """A queued voice event is transcribed when preparing inbound text.

    With STT enabled and ``transcribe_audio`` stubbed to succeed, the text
    returned by ``_prepare_inbound_message_text`` must be non-``None``,
    include the stubbed transcript, and mention "voice message".
    """
    from gateway.run import GatewayRunner

    # Skip __init__ and set attributes by hand (presumably the ones the
    # method under test reads — confirm against gateway.run).
    gateway = GatewayRunner.__new__(GatewayRunner)
    gateway.config = GatewayConfig(stt_enabled=True)
    gateway.adapters = {}
    gateway._model = "test-model"
    gateway._base_url = ""
    gateway._has_setup_skill = lambda: False

    dm_source = SessionSource(
        platform=Platform.TELEGRAM,
        chat_id="123",
        chat_type="dm",
    )
    voice_event = MessageEvent(
        text="",
        message_type=MessageType.VOICE,
        source=dm_source,
        media_urls=["/tmp/queued-voice.ogg"],
        media_types=["audio/ogg"],
    )

    successful_transcription = {
        "success": True,
        "transcript": "queued voice transcript",
        "provider": "local_command",
    }
    stub_transcribe = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=successful_transcription,
    )

    with stub_transcribe:
        prepared = await gateway._prepare_inbound_message_text(
            event=voice_event,
            source=dm_source,
            history=[],
        )

    assert prepared is not None
    assert "queued voice transcript" in prepared
    assert "voice message" in prepared.lower()
|