feat(desktop): full tool-backend config (pickers + per-backend settings) in Settings (#41232)

* feat(desktop): surface TTS/STT/terminal backends as Settings dropdowns

Every native tool backend that the agent supports now shows up as a
clickable picker in the desktop Settings UI instead of a free-text box.

Desktop Settings renders a config field as a <Select> only if its dotpath
is a key in ENUM_OPTIONS (helpers.ts::enumOptionsFor returns undefined ->
free-text <Input> otherwise). Three backend-selector fields were surfaced
in their sections but missing from the map, so users had to hand-type the
provider name and could reasonably assume it was unsupported:

- tts.provider — now lists all built-in TTS backends incl. xai (Grok)
- stt.provider — local/groq/openai/mistral/elevenlabs
- terminal.backend — local/docker/singularity/modal/daytona/ssh

Each list is kept in sync with its backend source of truth (TTS:
agent/tts_registry.py::_BUILTIN_NAMES + tools/tts_tool.py; STT + terminal:
hermes_cli/config.py / tools/terminal_tool.py). The existing
enumOptionsFor current-value-append keeps any hand-typed/legacy value
selected, and command-type TTS providers still work.

Reported for Grok/xAI TTS, which was already a fully-wired built-in
provider (tts.provider: xai + XAI_API_KEY) with no picker entry.

* feat(desktop): expose per-backend TTS/STT/terminal config fields in Settings

Completes the backend-coverage pass: not just the provider PICKER but every
backend's own config fields are now tunable from desktop Settings, so a user
who picks (e.g.) Grok TTS can also set its voice/language without hand-editing
config.yaml.

Also fixes the STT provider dropdown: added 'xai' (Grok STT), which the
transcription dispatcher (tools/transcription_tools.py) handles but the
config.py comment had omitted — the dispatch ladder is the source of truth.

New Settings fields (Voice section):
- TTS xai (voice_id, language), minimax (model, voice_id), mistral
  (model, voice_id), gemini (model, voice), neutts (model, device),
  kittentts (model, voice), piper (voice)
- STT openai (model), groq (model), mistral (model)

New Settings fields (Advanced section):
- terminal docker_image / singularity_image / modal_image / daytona_image

New ENUM_OPTIONS dropdowns: stt.provider (+xai), stt.openai.model,
stt.mistral.model, tts.openai.model, tts.elevenlabs.model_id,
tts.neutts.device. Each list mirrors the backend generator's accepted values
(tools/tts_tool.py, tools/transcription_tools.py, hermes_cli/config.py).

i18n: FIELD_LABELS/FIELD_DESCRIPTIONS cover all locales via the English
fallback in config-settings.tsx; added native translations to ja/zh/zh-hant.

Secrets (provider API keys, modal/daytona tokens, ssh host/key) intentionally
stay in Settings -> Keys as env vars, not duplicated as config fields.
This commit is contained in:
Teknium 2026-06-07 10:05:47 -07:00 committed by GitHub
parent 0c48b7165d
commit 20fd0bde5d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 261 additions and 6 deletions

View file

@ -240,9 +240,37 @@ export const ENUM_OPTIONS: Record<string, string[]> = {
'context.engine': ['compressor', 'default', 'custom'],
'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'],
'memory.provider': ['', 'builtin', 'honcho'],
// Terminal execution backends — kept in sync with the dispatch ladder in
// tools/terminal_tool.py::_create_environment (local/docker/singularity/
// modal/daytona/ssh). Remote backends need extra env (image, tokens, host).
'terminal.backend': ['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'],
'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'],
'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'],
// Speech-to-text backends — kept in sync with the dispatch ladder in
// tools/transcription_tools.py, which is the source of truth
// (local/groq/openai/mistral/xai/elevenlabs). 'xai' is Grok STT.
'stt.provider': ['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'],
'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'],
// Text-to-speech backends — kept in sync with the built-in source of truth
// (agent/tts_registry.py::_BUILTIN_NAMES / tools/tts_tool.py::
// BUILTIN_TTS_PROVIDERS). 'xai' is Grok TTS.
'tts.provider': [
'edge',
'elevenlabs',
'openai',
'xai',
'minimax',
'mistral',
'gemini',
'neutts',
'kittentts',
'piper'
],
'stt.openai.model': ['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'],
'stt.mistral.model': ['voxtral-mini-latest', 'voxtral-mini-2602'],
'tts.openai.model': ['gpt-4o-mini-tts', 'tts-1', 'tts-1-hd'],
'tts.elevenlabs.model_id': ['eleven_multilingual_v2', 'eleven_turbo_v2_5', 'eleven_flash_v2_5'],
// NeuTTS local inference device.
'tts.neutts.device': ['cpu', 'cuda', 'mps'],
'updates.non_interactive_local_changes': ['stash', 'discard']
}
@ -268,7 +296,11 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
backend: 'Execution Backend',
timeout: 'Command Timeout',
persistentShell: 'Persistent Shell',
envPassthrough: 'Environment Passthrough'
envPassthrough: 'Environment Passthrough',
dockerImage: 'Docker Image',
singularityImage: 'Singularity Image',
modalImage: 'Modal Image',
daytonaImage: 'Daytona Image'
},
fileReadMaxChars: 'File Read Limit',
toolOutput: {
@ -309,6 +341,15 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
model: 'Local Transcription Model',
language: 'Transcription Language'
},
openai: {
model: 'OpenAI STT Model'
},
groq: {
model: 'Groq STT Model'
},
mistral: {
model: 'Mistral STT Model'
},
elevenlabs: {
modelId: 'ElevenLabs STT Model',
languageCode: 'ElevenLabs Language',
@ -328,6 +369,33 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
elevenlabs: {
voiceId: 'ElevenLabs Voice',
modelId: 'ElevenLabs Model'
},
xai: {
voiceId: 'xAI (Grok) Voice',
language: 'xAI Language'
},
minimax: {
model: 'MiniMax TTS Model',
voiceId: 'MiniMax Voice'
},
mistral: {
model: 'Mistral TTS Model',
voiceId: 'Mistral Voice'
},
gemini: {
model: 'Gemini TTS Model',
voice: 'Gemini Voice'
},
neutts: {
model: 'NeuTTS Model',
device: 'NeuTTS Device'
},
kittentts: {
model: 'KittenTTS Model',
voice: 'KittenTTS Voice'
},
piper: {
voice: 'Piper Voice'
}
},
memory: {
@ -375,7 +443,11 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
terminal: {
cwd: 'Default project folder for tool and terminal work.',
persistentShell: 'Keep shell state between commands when the backend supports it.',
envPassthrough: 'Environment variables to pass into tool execution.'
envPassthrough: 'Environment variables to pass into tool execution.',
dockerImage: 'Container image used when the execution backend is Docker.',
singularityImage: 'Image used when the execution backend is Singularity.',
modalImage: 'Image used when the execution backend is Modal.',
daytonaImage: 'Image used when the execution backend is Daytona.'
},
codeExecution: {
mode: 'How strictly code execution is scoped to the current project.'
@ -404,6 +476,15 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
voice: {
autoTts: 'Automatically speak assistant responses.'
},
tts: {
xai: {
voiceId: 'xAI voice ID (e.g. eve) or a custom voice ID.',
language: 'Spoken language code, e.g. en.'
},
neutts: {
device: 'Local inference device for NeuTTS.'
}
},
stt: {
enabled: 'Enable local or provider-backed speech transcription.',
elevenlabs: {
@ -495,8 +576,24 @@ export const SECTIONS: DesktopConfigSection[] = [
'tts.openai.voice',
'tts.elevenlabs.voice_id',
'tts.elevenlabs.model_id',
'tts.xai.voice_id',
'tts.xai.language',
'tts.minimax.model',
'tts.minimax.voice_id',
'tts.mistral.model',
'tts.mistral.voice_id',
'tts.gemini.model',
'tts.gemini.voice',
'tts.neutts.model',
'tts.neutts.device',
'tts.kittentts.model',
'tts.kittentts.voice',
'tts.piper.voice',
'stt.local.model',
'stt.local.language',
'stt.openai.model',
'stt.groq.model',
'stt.mistral.model',
'stt.elevenlabs.model_id',
'stt.elevenlabs.language_code',
'stt.elevenlabs.tag_audio_events',
@ -513,6 +610,10 @@ export const SECTIONS: DesktopConfigSection[] = [
'toolsets',
'terminal.backend',
'terminal.timeout',
'terminal.docker_image',
'terminal.singularity_image',
'terminal.modal_image',
'terminal.daytona_image',
'tool_output.max_bytes',
'tool_output.max_lines',
'tool_output.max_line_length',

View file

@ -3,7 +3,7 @@ import { describe, expect, it } from 'vitest'
import type { HermesConfigRecord } from '@/types/hermes'
import { defineFieldCopy, fieldCopyForSchemaKey, schemaKeyToFieldCopyKey } from './field-copy'
import { getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'
import { enumOptionsFor, getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'
describe('settings helpers', () => {
describe('defineFieldCopy', () => {
@ -135,4 +135,38 @@ describe('settings helpers', () => {
expect(providerGroup('SOMETHING_RANDOM')).toBe('Other')
})
})
// Regression tests for the backend-selector entries in ENUM_OPTIONS: each
// dotpath exercised below must yield a defined option list from
// enumOptionsFor (the Settings UI uses that list to render a dropdown).
describe('enumOptionsFor — backend selector dropdowns', () => {
// An empty config record is passed to every call in this suite.
const config: HermesConfigRecord = {}
it('renders a dropdown for the TTS provider including xAI (Grok)', () => {
const opts = enumOptionsFor('tts.provider', 'edge', config)
// Membership checks only, so the provider list can grow without breaking this test.
expect(opts).toBeDefined()
expect(opts).toContain('xai')
expect(opts).toContain('edge')
expect(opts).toContain('elevenlabs')
})
it('renders a dropdown for the STT provider including xAI (Grok)', () => {
const opts = enumOptionsFor('stt.provider', 'local', config)
// Exact match: pins both the order and the fact that the current value
// ('local'), already present in the list, is not added a second time.
expect(opts).toEqual(['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'])
})
it('renders dropdowns for per-backend model/device sub-fields', () => {
// Spot-check one non-default option per model list; the device list is pinned exactly.
expect(enumOptionsFor('stt.openai.model', 'whisper-1', config)).toContain('gpt-4o-transcribe')
expect(enumOptionsFor('tts.openai.model', 'gpt-4o-mini-tts', config)).toContain('tts-1-hd')
expect(enumOptionsFor('tts.neutts.device', 'cpu', config)).toEqual(['cpu', 'cuda', 'mps'])
})
it('renders a dropdown for the terminal execution backend', () => {
const opts = enumOptionsFor('terminal.backend', 'local', config)
// Exact match against the six terminal backends, in declaration order.
expect(opts).toEqual(['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'])
})
it('appends a hand-typed value not in the known list so it stays selected', () => {
// The current value is not one of the built-in providers; it must still be
// offered as an option alongside the known ones.
const opts = enumOptionsFor('tts.provider', 'my-custom-command-tts', config)
expect(opts).toContain('my-custom-command-tts')
expect(opts).toContain('xai')
})
})
})

View file

@ -239,7 +239,11 @@ export const ja = defineLocale({
backend: '実行バックエンド',
timeout: 'コマンドタイムアウト',
persistentShell: '永続シェル',
envPassthrough: '環境変数の引き継ぎ'
envPassthrough: '環境変数の引き継ぎ',
dockerImage: 'Docker イメージ',
singularityImage: 'Singularity イメージ',
modalImage: 'Modal イメージ',
daytonaImage: 'Daytona イメージ'
},
fileReadMaxChars: 'ファイル読み取り上限',
toolOutput: {
@ -280,6 +284,15 @@ export const ja = defineLocale({
model: 'ローカル文字起こしモデル',
language: '文字起こし言語'
},
openai: {
model: 'OpenAI STT モデル'
},
groq: {
model: 'Groq STT モデル'
},
mistral: {
model: 'Mistral STT モデル'
},
elevenlabs: {
modelId: 'ElevenLabs STT モデル',
languageCode: 'ElevenLabs 言語',
@ -299,6 +312,33 @@ export const ja = defineLocale({
elevenlabs: {
voiceId: 'ElevenLabs 音声',
modelId: 'ElevenLabs モデル'
},
xai: {
voiceId: 'xAI (Grok) 音声',
language: 'xAI 言語'
},
minimax: {
model: 'MiniMax TTS モデル',
voiceId: 'MiniMax 音声'
},
mistral: {
model: 'Mistral TTS モデル',
voiceId: 'Mistral 音声'
},
gemini: {
model: 'Gemini TTS モデル',
voice: 'Gemini 音声'
},
neutts: {
model: 'NeuTTS モデル',
device: 'NeuTTS デバイス'
},
kittentts: {
model: 'KittenTTS モデル',
voice: 'KittenTTS 音声'
},
piper: {
voice: 'Piper 音声'
}
},
memory: {

View file

@ -233,7 +233,11 @@ export const zhHant = defineLocale({
backend: '執行後端',
timeout: '指令逾時',
persistentShell: '持久化 Shell',
envPassthrough: '環境變數傳遞'
envPassthrough: '環境變數傳遞',
dockerImage: 'Docker 映像',
singularityImage: 'Singularity 映像',
modalImage: 'Modal 映像',
daytonaImage: 'Daytona 映像'
},
fileReadMaxChars: '檔案讀取上限',
toolOutput: {
@ -274,6 +278,15 @@ export const zhHant = defineLocale({
model: '本機轉寫模型',
language: '轉寫語言'
},
openai: {
model: 'OpenAI STT 模型'
},
groq: {
model: 'Groq STT 模型'
},
mistral: {
model: 'Mistral STT 模型'
},
elevenlabs: {
modelId: 'ElevenLabs STT 模型',
languageCode: 'ElevenLabs 語言',
@ -293,6 +306,33 @@ export const zhHant = defineLocale({
elevenlabs: {
voiceId: 'ElevenLabs 語音',
modelId: 'ElevenLabs 模型'
},
xai: {
voiceId: 'xAI (Grok) 語音',
language: 'xAI 語言'
},
minimax: {
model: 'MiniMax TTS 模型',
voiceId: 'MiniMax 語音'
},
mistral: {
model: 'Mistral TTS 模型',
voiceId: 'Mistral 語音'
},
gemini: {
model: 'Gemini TTS 模型',
voice: 'Gemini 語音'
},
neutts: {
model: 'NeuTTS 模型',
device: 'NeuTTS 裝置'
},
kittentts: {
model: 'KittenTTS 模型',
voice: 'KittenTTS 語音'
},
piper: {
voice: 'Piper 語音'
}
},
memory: {

View file

@ -311,7 +311,11 @@ export const zh: Translations = {
backend: '执行后端',
timeout: '命令超时',
persistentShell: '持久化 Shell',
envPassthrough: '环境变量透传'
envPassthrough: '环境变量透传',
dockerImage: 'Docker 镜像',
singularityImage: 'Singularity 镜像',
modalImage: 'Modal 镜像',
daytonaImage: 'Daytona 镜像'
},
fileReadMaxChars: '文件读取上限',
toolOutput: {
@ -352,6 +356,15 @@ export const zh: Translations = {
model: '本地转写模型',
language: '转写语言'
},
openai: {
model: 'OpenAI STT 模型'
},
groq: {
model: 'Groq STT 模型'
},
mistral: {
model: 'Mistral STT 模型'
},
elevenlabs: {
modelId: 'ElevenLabs STT 模型',
languageCode: 'ElevenLabs 语言',
@ -371,6 +384,33 @@ export const zh: Translations = {
elevenlabs: {
voiceId: 'ElevenLabs 语音',
modelId: 'ElevenLabs 模型'
},
xai: {
voiceId: 'xAI (Grok) 语音',
language: 'xAI 语言'
},
minimax: {
model: 'MiniMax TTS 模型',
voiceId: 'MiniMax 语音'
},
mistral: {
model: 'Mistral TTS 模型',
voiceId: 'Mistral 语音'
},
gemini: {
model: 'Gemini TTS 模型',
voice: 'Gemini 语音'
},
neutts: {
model: 'NeuTTS 模型',
device: 'NeuTTS 设备'
},
kittentts: {
model: 'KittenTTS 模型',
voice: 'KittenTTS 语音'
},
piper: {
voice: 'Piper 语音'
}
},
memory: {