From dfc2fd887e5022103fc46eea4dcb93413dc77315 Mon Sep 17 00:00:00 2001
From: Brooklyn Nicholson
Date: Sat, 30 May 2026 19:35:34 -0500
Subject: [PATCH] fix: tts endpoints

---
 hermes_cli/web_server.py            | 158 +++++++++++++++++++++++++++-
 tests/hermes_cli/test_web_server.py |  95 +++++++++++++++++
 2 files changed, 250 insertions(+), 3 deletions(-)

diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py
index eee4710bbc3..903b026ae1e 100644
--- a/hermes_cli/web_server.py
+++ b/hermes_cli/web_server.py
@@ -927,6 +927,155 @@ async def transcribe_audio_upload(payload: AudioTranscriptionRequest):
     }
 
 
+class TTSSpeakRequest(BaseModel):
+    """Request body for ``POST /api/audio/speak``: the text to synthesize."""
+
+    text: str
+
+
+def _elevenlabs_voice_label(voice: Dict[str, Any]) -> str:
+    """Build the dropdown label for a voice: ``name`` or ``name (category)``."""
+    name = str(voice.get("name") or voice.get("voice_id") or "Voice").strip()
+    category = str(voice.get("category") or "").strip()
+
+    return f"{name} ({category})" if category else name
+
+
+@app.get("/api/audio/elevenlabs/voices")
+async def get_elevenlabs_voices():
+    """Return ElevenLabs voices when an API key is configured.
+
+    The desktop UI uses this for the ``tts.elevenlabs.voice_id`` dropdown.
+    Only non-secret voice metadata is returned; the API key stays server-side.
+
+    Responds with ``{"available": False, "voices": []}`` when no key is set and
+    with HTTP 502 when the ElevenLabs request fails or returns a payload that
+    is not a JSON object.
+    """
+    api_key = (load_env().get("ELEVENLABS_API_KEY") or os.environ.get("ELEVENLABS_API_KEY") or "").strip()
+    if not api_key:
+        return {"available": False, "voices": []}
+
+    request = urllib.request.Request(
+        "https://api.elevenlabs.io/v1/voices",
+        headers={
+            "Accept": "application/json",
+            "xi-api-key": api_key,
+        },
+    )
+
+    try:
+        loop = asyncio.get_running_loop()
+
+        def _fetch() -> Dict[str, Any]:
+            with urllib.request.urlopen(request, timeout=10) as response:
+                return json.loads(response.read().decode("utf-8"))
+
+        # urllib is blocking; run it on the default executor.
+        payload = await loop.run_in_executor(None, _fetch)
+    except Exception as exc:
+        _log.warning("ElevenLabs voice list failed: %s", exc)
+        raise HTTPException(status_code=502, detail="Could not load ElevenLabs voices") from exc
+
+    # A successful response that is not a JSON object (``null``, a list, ...)
+    # would otherwise crash on ``.get`` below and surface as an opaque 500.
+    if not isinstance(payload, dict):
+        _log.warning("ElevenLabs voice list returned a non-object payload")
+        raise HTTPException(status_code=502, detail="Could not load ElevenLabs voices")
+
+    voices = []
+    for voice in payload.get("voices") or []:
+        if not isinstance(voice, dict):
+            continue
+
+        voice_id = str(voice.get("voice_id") or "").strip()
+        if not voice_id:
+            continue
+
+        voices.append({
+            "voice_id": voice_id,
+            "name": str(voice.get("name") or voice_id),
+            "label": _elevenlabs_voice_label(voice),
+        })
+
+    voices.sort(key=lambda item: str(item.get("label") or "").lower())
+    return {"available": True, "voices": voices}
+
+
+@app.post("/api/audio/speak")
+async def speak_text(payload: TTSSpeakRequest):
+    """Synthesize speech and return audio as base64 data URL.
+
+    Used by the desktop voice-conversation mode to play back assistant
+    responses without exposing the on-disk file path. Reuses the
+    existing TTS provider chain (Edge / OpenAI / ElevenLabs / etc.)
+    configured in ``~/.hermes/config.yaml`` under ``tts.``.
+
+    The generated audio file is deleted once it has been read.
+    """
+    text = (payload.text or "").strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="Text is required")
+
+    try:
+        from tools.tts_tool import text_to_speech_tool
+        loop = asyncio.get_running_loop()
+        result_json = await loop.run_in_executor(None, text_to_speech_tool, text)
+    except Exception as exc:
+        _log.exception("Desktop voice TTS failed")
+        raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {exc}") from exc
+
+    try:
+        result = json.loads(result_json) if isinstance(result_json, str) else result_json
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail="Invalid TTS response") from exc
+
+    # Valid JSON is not necessarily an object; anything else would crash on
+    # ``.get`` below instead of producing a clean error.
+    if not isinstance(result, dict):
+        raise HTTPException(status_code=500, detail="Invalid TTS response")
+
+    if not result.get("success"):
+        raise HTTPException(
+            status_code=400,
+            detail=result.get("error") or "Speech synthesis failed",
+        )
+
+    file_path = result.get("file_path")
+    if not file_path or not os.path.isfile(file_path):
+        raise HTTPException(status_code=500, detail="Audio file missing")
+
+    ext = os.path.splitext(file_path)[1].lower()
+    mime_type = {
+        ".mp3": "audio/mpeg",
+        ".ogg": "audio/ogg",
+        ".opus": "audio/ogg",
+        ".wav": "audio/wav",
+        ".flac": "audio/flac",
+    }.get(ext, "audio/mpeg")
+
+    try:
+        with open(file_path, "rb") as fh:
+            audio_bytes = fh.read()
+    except OSError as exc:
+        raise HTTPException(status_code=500, detail=f"Could not read audio: {exc}") from exc
+    finally:
+        # Best-effort cleanup: the audio is returned inline, so the file is
+        # no longer needed whether or not the read succeeded.
+        try:
+            os.unlink(file_path)
+        except OSError:
+            pass
+
+    encoded = base64.b64encode(audio_bytes).decode("ascii")
+    return {
+        "ok": True,
+        "data_url": f"data:{mime_type};base64,{encoded}",
+        "mime_type": mime_type,
+        "provider": result.get("provider"),
+    }
+
+
 @app.get("/api/actions/{name}/status")
 async def get_action_status(name: str, lines: int = 200):
     """Tail an action log and report whether the process is still running."""
@@ -957,13 +1106,16 @@ async def get_action_status(name: str, lines: int = 200):
 
 
 @app.get("/api/sessions")
-async def get_sessions(limit: int = 20, offset: int = 0):
+async def get_sessions(limit: int = 20, offset: int = 0, min_messages: int = 0):
     try:
         from hermes_state import SessionDB
         db = SessionDB()
         try:
-            sessions = db.list_sessions_rich(limit=limit, offset=offset)
-            total = db.session_count()
+            min_message_count = max(0, min_messages)
+            sessions = db.list_sessions_rich(
+                limit=limit, offset=offset, min_message_count=min_message_count
+            )
+            total = db.session_count(min_message_count=min_message_count)
             now = time.time()
             for s in sessions:
                 s["is_active"] = (
diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py
index 7bc5c5185ac..29e569e6949 100644
--- a/tests/hermes_cli/test_web_server.py
+++ b/tests/hermes_cli/test_web_server.py
@@ -174,6 +174,37 @@ class TestWebServerEndpoints:
         row = next(s for s in rows if s["id"] == "session-no-cwd")
         assert row["cwd"] is None
 
+    def test_get_sessions_forwards_min_messages(self, monkeypatch):
+        """The ?min_messages= filter must reach SessionDB.
+
+        The desktop session picker calls /api/sessions?...&min_messages=N to
+        hide empty sessions. The param was silently dropped from the handler
+        in a merge once (SessionDB still supported it); guard the wiring.
+        """
+        captured = {}
+
+        class _FakeDB:
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def list_sessions_rich(self, limit, offset, min_message_count=0):
+                captured["list"] = min_message_count
+                return []
+
+            def session_count(self, min_message_count=0):
+                captured["count"] = min_message_count
+                return 0
+
+            def close(self):
+                pass
+
+        monkeypatch.setattr("hermes_state.SessionDB", _FakeDB)
+
+        resp = self.client.get("/api/sessions?limit=5&offset=0&min_messages=3")
+        assert resp.status_code == 200
+        assert captured["list"] == 3
+        assert captured["count"] == 3
+
     def test_audio_transcription_endpoint(self, monkeypatch):
         import tools.transcription_tools as transcription_tools
 
@@ -218,6 +249,70 @@ class TestWebServerEndpoints:
         assert resp.status_code == 400
         assert "base64" in resp.json()["detail"]
 
+    def test_desktop_audio_routes_registered(self):
+        """All three desktop voice endpoints must exist.
+
+        The renderer (apps/desktop) calls /api/audio/transcribe, /speak, and
+        /elevenlabs/voices. /speak + /voices were silently dropped in a merge
+        once; this guards the contract so a future merge can't lose them
+        without failing CI.
+        """
+        from hermes_cli.web_server import app
+
+        paths = {getattr(r, "path", None) for r in app.routes}
+        assert "/api/audio/transcribe" in paths
+        assert "/api/audio/speak" in paths
+        assert "/api/audio/elevenlabs/voices" in paths
+
+    def test_elevenlabs_voices_unavailable_without_key(self, monkeypatch):
+        import hermes_cli.web_server as web_server
+
+        monkeypatch.setattr(web_server, "load_env", lambda: {})
+        monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
+
+        resp = self.client.get("/api/audio/elevenlabs/voices")
+        assert resp.status_code == 200
+        assert resp.json() == {"available": False, "voices": []}
+
+    def test_speak_text_returns_base64_data_url(self, monkeypatch, tmp_path):
+        import tools.tts_tool as tts_tool
+
+        audio_file = tmp_path / "speech.mp3"
+        audio_file.write_bytes(b"ID3fake-audio-bytes")
+
+        def fake_tts(text):
+            return json.dumps({
+                "success": True,
+                "file_path": str(audio_file),
+                "provider": "test",
+            })
+
+        monkeypatch.setattr(tts_tool, "text_to_speech_tool", fake_tts)
+
+        resp = self.client.post("/api/audio/speak", json={"text": "hello there"})
+        assert resp.status_code == 200
+        body = resp.json()
+        assert body["ok"] is True
+        assert body["mime_type"] == "audio/mpeg"
+        assert body["data_url"].startswith("data:audio/mpeg;base64,")
+        assert body["provider"] == "test"
+        # The handler returns the bytes inline (base64) and removes the temp file.
+        assert not audio_file.exists()
+
+    def test_speak_text_requires_nonempty_text(self):
+        resp = self.client.post("/api/audio/speak", json={"text": "   "})
+        assert resp.status_code == 400
+
+    def test_speak_text_rejects_non_object_tts_response(self, monkeypatch):
+        import tools.tts_tool as tts_tool
+
+        # Valid JSON, but not an object: must be a clean 500, not a crash.
+        monkeypatch.setattr(tts_tool, "text_to_speech_tool", lambda text: "[]")
+
+        resp = self.client.post("/api/audio/speak", json={"text": "hello there"})
+        assert resp.status_code == 500
+        assert resp.json()["detail"] == "Invalid TTS response"
+
     def test_get_status_filters_unconfigured_gateway_platforms(self, monkeypatch):
         import gateway.config as gateway_config
         import hermes_cli.web_server as web_server