diff --git a/apps/desktop/src/app/settings/constants.ts b/apps/desktop/src/app/settings/constants.ts index 4d0e11b2822..1cf7cf3ce16 100644 --- a/apps/desktop/src/app/settings/constants.ts +++ b/apps/desktop/src/app/settings/constants.ts @@ -240,9 +240,37 @@ export const ENUM_OPTIONS: Record = { 'context.engine': ['compressor', 'default', 'custom'], 'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'], 'memory.provider': ['', 'builtin', 'honcho'], + // Terminal execution backends — kept in sync with the dispatch ladder in + // tools/terminal_tool.py::_create_environment (local/docker/singularity/ + // modal/daytona/ssh). Remote backends need extra env (image, tokens, host). + 'terminal.backend': ['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'], 'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'], 'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'], + // Speech-to-text backends — kept in sync with the stt block in + // hermes_cli/config.py (local/groq/openai/mistral/xai/elevenlabs). + 'stt.provider': ['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'], 'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], + // Text-to-speech backends — kept in sync with the built-in source of truth + // (agent/tts_registry.py::_BUILTIN_NAMES / tools/tts_tool.py:: + // BUILTIN_TTS_PROVIDERS). 'xai' is Grok TTS. + 'tts.provider': [ + 'edge', + 'elevenlabs', + 'openai', + 'xai', + 'minimax', + 'mistral', + 'gemini', + 'neutts', + 'kittentts', + 'piper' + ], + 'stt.openai.model': ['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'], + 'stt.mistral.model': ['voxtral-mini-latest', 'voxtral-mini-2602'], + 'tts.openai.model': ['gpt-4o-mini-tts', 'tts-1', 'tts-1-hd'], + 'tts.elevenlabs.model_id': ['eleven_multilingual_v2', 'eleven_turbo_v2_5', 'eleven_flash_v2_5'], + // NeuTTS local inference device. 
+ 'tts.neutts.device': ['cpu', 'cuda', 'mps'], 'updates.non_interactive_local_changes': ['stash', 'discard'] } @@ -268,7 +296,11 @@ export const FIELD_LABELS: Record = defineFieldCopy({ backend: 'Execution Backend', timeout: 'Command Timeout', persistentShell: 'Persistent Shell', - envPassthrough: 'Environment Passthrough' + envPassthrough: 'Environment Passthrough', + dockerImage: 'Docker Image', + singularityImage: 'Singularity Image', + modalImage: 'Modal Image', + daytonaImage: 'Daytona Image' }, fileReadMaxChars: 'File Read Limit', toolOutput: { @@ -309,6 +341,15 @@ export const FIELD_LABELS: Record = defineFieldCopy({ model: 'Local Transcription Model', language: 'Transcription Language' }, + openai: { + model: 'OpenAI STT Model' + }, + groq: { + model: 'Groq STT Model' + }, + mistral: { + model: 'Mistral STT Model' + }, elevenlabs: { modelId: 'ElevenLabs STT Model', languageCode: 'ElevenLabs Language', @@ -328,6 +369,33 @@ export const FIELD_LABELS: Record = defineFieldCopy({ elevenlabs: { voiceId: 'ElevenLabs Voice', modelId: 'ElevenLabs Model' + }, + xai: { + voiceId: 'xAI (Grok) Voice', + language: 'xAI Language' + }, + minimax: { + model: 'MiniMax TTS Model', + voiceId: 'MiniMax Voice' + }, + mistral: { + model: 'Mistral TTS Model', + voiceId: 'Mistral Voice' + }, + gemini: { + model: 'Gemini TTS Model', + voice: 'Gemini Voice' + }, + neutts: { + model: 'NeuTTS Model', + device: 'NeuTTS Device' + }, + kittentts: { + model: 'KittenTTS Model', + voice: 'KittenTTS Voice' + }, + piper: { + voice: 'Piper Voice' } }, memory: { @@ -375,7 +443,11 @@ export const FIELD_DESCRIPTIONS: Record = defineFieldCopy({ terminal: { cwd: 'Default project folder for tool and terminal work.', persistentShell: 'Keep shell state between commands when the backend supports it.', - envPassthrough: 'Environment variables to pass into tool execution.' 
+ envPassthrough: 'Environment variables to pass into tool execution.', + dockerImage: 'Container image used when the execution backend is Docker.', + singularityImage: 'Image used when the execution backend is Singularity.', + modalImage: 'Image used when the execution backend is Modal.', + daytonaImage: 'Image used when the execution backend is Daytona.' }, codeExecution: { mode: 'How strictly code execution is scoped to the current project.' @@ -404,6 +476,15 @@ export const FIELD_DESCRIPTIONS: Record = defineFieldCopy({ voice: { autoTts: 'Automatically speak assistant responses.' }, + tts: { + xai: { + voiceId: 'xAI voice ID (e.g. eve) or a custom voice ID.', + language: 'Spoken language code, e.g. en.' + }, + neutts: { + device: 'Local inference device for NeuTTS.' + } + }, stt: { enabled: 'Enable local or provider-backed speech transcription.', elevenlabs: { @@ -495,8 +576,24 @@ export const SECTIONS: DesktopConfigSection[] = [ 'tts.openai.voice', 'tts.elevenlabs.voice_id', 'tts.elevenlabs.model_id', + 'tts.xai.voice_id', + 'tts.xai.language', + 'tts.minimax.model', + 'tts.minimax.voice_id', + 'tts.mistral.model', + 'tts.mistral.voice_id', + 'tts.gemini.model', + 'tts.gemini.voice', + 'tts.neutts.model', + 'tts.neutts.device', + 'tts.kittentts.model', + 'tts.kittentts.voice', + 'tts.piper.voice', 'stt.local.model', 'stt.local.language', + 'stt.openai.model', + 'stt.groq.model', + 'stt.mistral.model', 'stt.elevenlabs.model_id', 'stt.elevenlabs.language_code', 'stt.elevenlabs.tag_audio_events', @@ -513,6 +610,10 @@ export const SECTIONS: DesktopConfigSection[] = [ 'toolsets', 'terminal.backend', 'terminal.timeout', + 'terminal.docker_image', + 'terminal.singularity_image', + 'terminal.modal_image', + 'terminal.daytona_image', 'tool_output.max_bytes', 'tool_output.max_lines', 'tool_output.max_line_length', diff --git a/apps/desktop/src/app/settings/helpers.test.ts b/apps/desktop/src/app/settings/helpers.test.ts index ee2377a24b1..b65d63d3296 100644 --- 
a/apps/desktop/src/app/settings/helpers.test.ts +++ b/apps/desktop/src/app/settings/helpers.test.ts @@ -3,7 +3,7 @@ import { describe, expect, it } from 'vitest' import type { HermesConfigRecord } from '@/types/hermes' import { defineFieldCopy, fieldCopyForSchemaKey, schemaKeyToFieldCopyKey } from './field-copy' -import { getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers' +import { enumOptionsFor, getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers' describe('settings helpers', () => { describe('defineFieldCopy', () => { @@ -135,4 +135,38 @@ describe('settings helpers', () => { expect(providerGroup('SOMETHING_RANDOM')).toBe('Other') }) }) + + describe('enumOptionsFor — backend selector dropdowns', () => { + const config: HermesConfigRecord = {} + + it('renders a dropdown for the TTS provider including xAI (Grok)', () => { + const opts = enumOptionsFor('tts.provider', 'edge', config) + expect(opts).toBeDefined() + expect(opts).toContain('xai') + expect(opts).toContain('edge') + expect(opts).toContain('elevenlabs') + }) + + it('renders a dropdown for the STT provider including xAI (Grok)', () => { + const opts = enumOptionsFor('stt.provider', 'local', config) + expect(opts).toEqual(['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs']) + }) + + it('renders dropdowns for per-backend model/device sub-fields', () => { + expect(enumOptionsFor('stt.openai.model', 'whisper-1', config)).toContain('gpt-4o-transcribe') + expect(enumOptionsFor('tts.openai.model', 'gpt-4o-mini-tts', config)).toContain('tts-1-hd') + expect(enumOptionsFor('tts.neutts.device', 'cpu', config)).toEqual(['cpu', 'cuda', 'mps']) + }) + + it('renders a dropdown for the terminal execution backend', () => { + const opts = enumOptionsFor('terminal.backend', 'local', config) + expect(opts).toEqual(['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh']) + }) + + it('appends a hand-typed value not in the known 
list so it stays selected', () => { + const opts = enumOptionsFor('tts.provider', 'my-custom-command-tts', config) + expect(opts).toContain('my-custom-command-tts') + expect(opts).toContain('xai') + }) + }) }) diff --git a/apps/desktop/src/i18n/ja.ts b/apps/desktop/src/i18n/ja.ts index 65a2bf6591e..625a4abdec6 100644 --- a/apps/desktop/src/i18n/ja.ts +++ b/apps/desktop/src/i18n/ja.ts @@ -239,7 +239,11 @@ export const ja = defineLocale({ backend: '実行バックエンド', timeout: 'コマンドタイムアウト', persistentShell: '永続シェル', - envPassthrough: '環境変数の引き継ぎ' + envPassthrough: '環境変数の引き継ぎ', + dockerImage: 'Docker イメージ', + singularityImage: 'Singularity イメージ', + modalImage: 'Modal イメージ', + daytonaImage: 'Daytona イメージ' }, fileReadMaxChars: 'ファイル読み取り上限', toolOutput: { @@ -280,6 +284,15 @@ export const ja = defineLocale({ model: 'ローカル文字起こしモデル', language: '文字起こし言語' }, + openai: { + model: 'OpenAI STT モデル' + }, + groq: { + model: 'Groq STT モデル' + }, + mistral: { + model: 'Mistral STT モデル' + }, elevenlabs: { modelId: 'ElevenLabs STT モデル', languageCode: 'ElevenLabs 言語', @@ -299,6 +312,33 @@ export const ja = defineLocale({ elevenlabs: { voiceId: 'ElevenLabs 音声', modelId: 'ElevenLabs モデル' + }, + xai: { + voiceId: 'xAI (Grok) 音声', + language: 'xAI 言語' + }, + minimax: { + model: 'MiniMax TTS モデル', + voiceId: 'MiniMax 音声' + }, + mistral: { + model: 'Mistral TTS モデル', + voiceId: 'Mistral 音声' + }, + gemini: { + model: 'Gemini TTS モデル', + voice: 'Gemini 音声' + }, + neutts: { + model: 'NeuTTS モデル', + device: 'NeuTTS デバイス' + }, + kittentts: { + model: 'KittenTTS モデル', + voice: 'KittenTTS 音声' + }, + piper: { + voice: 'Piper 音声' } }, memory: { diff --git a/apps/desktop/src/i18n/zh-hant.ts b/apps/desktop/src/i18n/zh-hant.ts index 76a0ea69643..c09793ccf34 100644 --- a/apps/desktop/src/i18n/zh-hant.ts +++ b/apps/desktop/src/i18n/zh-hant.ts @@ -233,7 +233,11 @@ export const zhHant = defineLocale({ backend: '執行後端', timeout: '指令逾時', persistentShell: '持久化 Shell', - envPassthrough: '環境變數傳遞' + envPassthrough: '環境變數傳遞', 
+ dockerImage: 'Docker 映像', + singularityImage: 'Singularity 映像', + modalImage: 'Modal 映像', + daytonaImage: 'Daytona 映像' }, fileReadMaxChars: '檔案讀取上限', toolOutput: { @@ -274,6 +278,15 @@ export const zhHant = defineLocale({ model: '本機轉寫模型', language: '轉寫語言' }, + openai: { + model: 'OpenAI STT 模型' + }, + groq: { + model: 'Groq STT 模型' + }, + mistral: { + model: 'Mistral STT 模型' + }, elevenlabs: { modelId: 'ElevenLabs STT 模型', languageCode: 'ElevenLabs 語言', @@ -293,6 +306,33 @@ export const zhHant = defineLocale({ elevenlabs: { voiceId: 'ElevenLabs 語音', modelId: 'ElevenLabs 模型' + }, + xai: { + voiceId: 'xAI (Grok) 語音', + language: 'xAI 語言' + }, + minimax: { + model: 'MiniMax TTS 模型', + voiceId: 'MiniMax 語音' + }, + mistral: { + model: 'Mistral TTS 模型', + voiceId: 'Mistral 語音' + }, + gemini: { + model: 'Gemini TTS 模型', + voice: 'Gemini 語音' + }, + neutts: { + model: 'NeuTTS 模型', + device: 'NeuTTS 裝置' + }, + kittentts: { + model: 'KittenTTS 模型', + voice: 'KittenTTS 語音' + }, + piper: { + voice: 'Piper 語音' } }, memory: { diff --git a/apps/desktop/src/i18n/zh.ts b/apps/desktop/src/i18n/zh.ts index d091e505586..7eac7b467b2 100644 --- a/apps/desktop/src/i18n/zh.ts +++ b/apps/desktop/src/i18n/zh.ts @@ -311,7 +311,11 @@ export const zh: Translations = { backend: '执行后端', timeout: '命令超时', persistentShell: '持久化 Shell', - envPassthrough: '环境变量透传' + envPassthrough: '环境变量透传', + dockerImage: 'Docker 镜像', + singularityImage: 'Singularity 镜像', + modalImage: 'Modal 镜像', + daytonaImage: 'Daytona 镜像' }, fileReadMaxChars: '文件读取上限', toolOutput: { @@ -352,6 +356,15 @@ export const zh: Translations = { model: '本地转写模型', language: '转写语言' }, + openai: { + model: 'OpenAI STT 模型' + }, + groq: { + model: 'Groq STT 模型' + }, + mistral: { + model: 'Mistral STT 模型' + }, elevenlabs: { modelId: 'ElevenLabs STT 模型', languageCode: 'ElevenLabs 语言', @@ -371,6 +384,33 @@ export const zh: Translations = { elevenlabs: { voiceId: 'ElevenLabs 语音', modelId: 'ElevenLabs 模型' + }, + xai: { + voiceId: 'xAI (Grok) 语音', 
+ language: 'xAI 语言' + }, + minimax: { + model: 'MiniMax TTS 模型', + voiceId: 'MiniMax 语音' + }, + mistral: { + model: 'Mistral TTS 模型', + voiceId: 'Mistral 语音' + }, + gemini: { + model: 'Gemini TTS 模型', + voice: 'Gemini 语音' + }, + neutts: { + model: 'NeuTTS 模型', + device: 'NeuTTS 设备' + }, + kittentts: { + model: 'KittenTTS 模型', + voice: 'KittenTTS 语音' + }, + piper: { + voice: 'Piper 语音' } }, memory: {