mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
feat(desktop): full tool-backend config (pickers + per-backend settings) in Settings (#41232)
* feat(desktop): surface TTS/STT/terminal backends as Settings dropdowns Every native tool backend that the agent supports now shows up as a clickable picker in the desktop Settings UI instead of a free-text box. Desktop Settings renders a config field as a <Select> only if its dotpath is a key in ENUM_OPTIONS (helpers.ts::enumOptionsFor returns undefined -> free-text <Input> otherwise). Three backend-selector fields were surfaced in their sections but missing from the map, so users had to hand-type the provider name and could reasonably assume it was unsupported: - tts.provider — now lists all built-in TTS backends incl. xai (Grok) - stt.provider — local/groq/openai/mistral/elevenlabs - terminal.backend — local/docker/singularity/modal/daytona/ssh Each list is kept in sync with its backend source of truth (TTS: agent/tts_registry.py::_BUILTIN_NAMES + tools/tts_tool.py; STT + terminal: hermes_cli/config.py / tools/terminal_tool.py). The existing enumOptionsFor current-value-append keeps any hand-typed/legacy value selected, and command-type TTS providers still work. Reported for Grok/xAI TTS, which was already a fully-wired built-in provider (tts.provider: xai + XAI_API_KEY) with no picker entry. * feat(desktop): expose per-backend TTS/STT/terminal config fields in Settings Completes the backend-coverage pass: not just the provider PICKER but every backend's own config fields are now tunable from desktop Settings, so a user who picks (e.g.) Grok TTS can also set its voice/language without hand-editing config.yaml. Also fixes the STT provider dropdown: added 'xai' (Grok STT), which the transcription dispatcher (tools/transcription_tools.py) handles but the config.py comment had omitted — the dispatch ladder is the source of truth. 
New Settings fields (Voice section): - TTS xai (voice_id, language), minimax (model, voice_id), mistral (model, voice_id), gemini (model, voice), neutts (model, device), kittentts (model, voice), piper (voice) - STT openai (model), groq (model), mistral (model) New Settings fields (Advanced section): - terminal docker_image / singularity_image / modal_image / daytona_image New ENUM_OPTIONS dropdowns: stt.provider (+xai), stt.openai.model, stt.mistral.model, tts.openai.model, tts.elevenlabs.model_id, tts.neutts.device. Each list mirrors the backend generator's accepted values (tools/tts_tool.py, tools/transcription_tools.py, hermes_cli/config.py). i18n: FIELD_LABELS/FIELD_DESCRIPTIONS cover all locales via the English fallback in config-settings.tsx; added native translations to ja/zh/zh-hant. Secrets (provider API keys, modal/daytona tokens, ssh host/key) intentionally stay in Settings -> Keys as env vars, not duplicated as config fields.
This commit is contained in:
parent
0c48b7165d
commit
20fd0bde5d
5 changed files with 261 additions and 6 deletions
|
|
@ -240,9 +240,37 @@ export const ENUM_OPTIONS: Record<string, string[]> = {
|
|||
'context.engine': ['compressor', 'default', 'custom'],
|
||||
'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'],
|
||||
'memory.provider': ['', 'builtin', 'honcho'],
|
||||
// Terminal execution backends — kept in sync with the dispatch ladder in
|
||||
// tools/terminal_tool.py::_create_environment (local/docker/singularity/
|
||||
// modal/daytona/ssh). Remote backends need extra env (image, tokens, host).
|
||||
'terminal.backend': ['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'],
|
||||
'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'],
|
||||
'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'],
|
||||
// Speech-to-text backends — kept in sync with the stt block in
|
||||
// hermes_cli/config.py and the dispatch ladder in tools/transcription_tools.py (local/groq/openai/mistral/xai/elevenlabs).
|
||||
'stt.provider': ['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'],
|
||||
'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'],
|
||||
// Text-to-speech backends — kept in sync with the built-in source of truth
|
||||
// (agent/tts_registry.py::_BUILTIN_NAMES / tools/tts_tool.py::
|
||||
// BUILTIN_TTS_PROVIDERS). 'xai' is Grok TTS.
|
||||
'tts.provider': [
|
||||
'edge',
|
||||
'elevenlabs',
|
||||
'openai',
|
||||
'xai',
|
||||
'minimax',
|
||||
'mistral',
|
||||
'gemini',
|
||||
'neutts',
|
||||
'kittentts',
|
||||
'piper'
|
||||
],
|
||||
'stt.openai.model': ['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'],
|
||||
'stt.mistral.model': ['voxtral-mini-latest', 'voxtral-mini-2602'],
|
||||
'tts.openai.model': ['gpt-4o-mini-tts', 'tts-1', 'tts-1-hd'],
|
||||
'tts.elevenlabs.model_id': ['eleven_multilingual_v2', 'eleven_turbo_v2_5', 'eleven_flash_v2_5'],
|
||||
// NeuTTS local inference device.
|
||||
'tts.neutts.device': ['cpu', 'cuda', 'mps'],
|
||||
'updates.non_interactive_local_changes': ['stash', 'discard']
|
||||
}
|
||||
|
||||
|
|
@ -268,7 +296,11 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
|
|||
backend: 'Execution Backend',
|
||||
timeout: 'Command Timeout',
|
||||
persistentShell: 'Persistent Shell',
|
||||
envPassthrough: 'Environment Passthrough'
|
||||
envPassthrough: 'Environment Passthrough',
|
||||
dockerImage: 'Docker Image',
|
||||
singularityImage: 'Singularity Image',
|
||||
modalImage: 'Modal Image',
|
||||
daytonaImage: 'Daytona Image'
|
||||
},
|
||||
fileReadMaxChars: 'File Read Limit',
|
||||
toolOutput: {
|
||||
|
|
@ -309,6 +341,15 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
|
|||
model: 'Local Transcription Model',
|
||||
language: 'Transcription Language'
|
||||
},
|
||||
openai: {
|
||||
model: 'OpenAI STT Model'
|
||||
},
|
||||
groq: {
|
||||
model: 'Groq STT Model'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral STT Model'
|
||||
},
|
||||
elevenlabs: {
|
||||
modelId: 'ElevenLabs STT Model',
|
||||
languageCode: 'ElevenLabs Language',
|
||||
|
|
@ -328,6 +369,33 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
|
|||
elevenlabs: {
|
||||
voiceId: 'ElevenLabs Voice',
|
||||
modelId: 'ElevenLabs Model'
|
||||
},
|
||||
xai: {
|
||||
voiceId: 'xAI (Grok) Voice',
|
||||
language: 'xAI Language'
|
||||
},
|
||||
minimax: {
|
||||
model: 'MiniMax TTS Model',
|
||||
voiceId: 'MiniMax Voice'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral TTS Model',
|
||||
voiceId: 'Mistral Voice'
|
||||
},
|
||||
gemini: {
|
||||
model: 'Gemini TTS Model',
|
||||
voice: 'Gemini Voice'
|
||||
},
|
||||
neutts: {
|
||||
model: 'NeuTTS Model',
|
||||
device: 'NeuTTS Device'
|
||||
},
|
||||
kittentts: {
|
||||
model: 'KittenTTS Model',
|
||||
voice: 'KittenTTS Voice'
|
||||
},
|
||||
piper: {
|
||||
voice: 'Piper Voice'
|
||||
}
|
||||
},
|
||||
memory: {
|
||||
|
|
@ -375,7 +443,11 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
|
|||
terminal: {
|
||||
cwd: 'Default project folder for tool and terminal work.',
|
||||
persistentShell: 'Keep shell state between commands when the backend supports it.',
|
||||
envPassthrough: 'Environment variables to pass into tool execution.'
|
||||
envPassthrough: 'Environment variables to pass into tool execution.',
|
||||
dockerImage: 'Container image used when the execution backend is Docker.',
|
||||
singularityImage: 'Image used when the execution backend is Singularity.',
|
||||
modalImage: 'Image used when the execution backend is Modal.',
|
||||
daytonaImage: 'Image used when the execution backend is Daytona.'
|
||||
},
|
||||
codeExecution: {
|
||||
mode: 'How strictly code execution is scoped to the current project.'
|
||||
|
|
@ -404,6 +476,15 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
|
|||
voice: {
|
||||
autoTts: 'Automatically speak assistant responses.'
|
||||
},
|
||||
tts: {
|
||||
xai: {
|
||||
voiceId: 'xAI voice ID (e.g. eve) or a custom voice ID.',
|
||||
language: 'Spoken language code, e.g. en.'
|
||||
},
|
||||
neutts: {
|
||||
device: 'Local inference device for NeuTTS.'
|
||||
}
|
||||
},
|
||||
stt: {
|
||||
enabled: 'Enable local or provider-backed speech transcription.',
|
||||
elevenlabs: {
|
||||
|
|
@ -495,8 +576,24 @@ export const SECTIONS: DesktopConfigSection[] = [
|
|||
'tts.openai.voice',
|
||||
'tts.elevenlabs.voice_id',
|
||||
'tts.elevenlabs.model_id',
|
||||
'tts.xai.voice_id',
|
||||
'tts.xai.language',
|
||||
'tts.minimax.model',
|
||||
'tts.minimax.voice_id',
|
||||
'tts.mistral.model',
|
||||
'tts.mistral.voice_id',
|
||||
'tts.gemini.model',
|
||||
'tts.gemini.voice',
|
||||
'tts.neutts.model',
|
||||
'tts.neutts.device',
|
||||
'tts.kittentts.model',
|
||||
'tts.kittentts.voice',
|
||||
'tts.piper.voice',
|
||||
'stt.local.model',
|
||||
'stt.local.language',
|
||||
'stt.openai.model',
|
||||
'stt.groq.model',
|
||||
'stt.mistral.model',
|
||||
'stt.elevenlabs.model_id',
|
||||
'stt.elevenlabs.language_code',
|
||||
'stt.elevenlabs.tag_audio_events',
|
||||
|
|
@ -513,6 +610,10 @@ export const SECTIONS: DesktopConfigSection[] = [
|
|||
'toolsets',
|
||||
'terminal.backend',
|
||||
'terminal.timeout',
|
||||
'terminal.docker_image',
|
||||
'terminal.singularity_image',
|
||||
'terminal.modal_image',
|
||||
'terminal.daytona_image',
|
||||
'tool_output.max_bytes',
|
||||
'tool_output.max_lines',
|
||||
'tool_output.max_line_length',
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import { describe, expect, it } from 'vitest'
|
|||
import type { HermesConfigRecord } from '@/types/hermes'
|
||||
|
||||
import { defineFieldCopy, fieldCopyForSchemaKey, schemaKeyToFieldCopyKey } from './field-copy'
|
||||
import { getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'
|
||||
import { enumOptionsFor, getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'
|
||||
|
||||
describe('settings helpers', () => {
|
||||
describe('defineFieldCopy', () => {
|
||||
|
|
@ -135,4 +135,38 @@ describe('settings helpers', () => {
|
|||
expect(providerGroup('SOMETHING_RANDOM')).toBe('Other')
|
||||
})
|
||||
})
|
||||
|
||||
describe('enumOptionsFor — backend selector dropdowns', () => {
|
||||
const config: HermesConfigRecord = {}
|
||||
|
||||
it('renders a dropdown for the TTS provider including xAI (Grok)', () => {
|
||||
const opts = enumOptionsFor('tts.provider', 'edge', config)
|
||||
expect(opts).toBeDefined()
|
||||
expect(opts).toContain('xai')
|
||||
expect(opts).toContain('edge')
|
||||
expect(opts).toContain('elevenlabs')
|
||||
})
|
||||
|
||||
it('renders a dropdown for the STT provider including xAI (Grok)', () => {
|
||||
const opts = enumOptionsFor('stt.provider', 'local', config)
|
||||
expect(opts).toEqual(['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'])
|
||||
})
|
||||
|
||||
it('renders dropdowns for per-backend model/device sub-fields', () => {
|
||||
expect(enumOptionsFor('stt.openai.model', 'whisper-1', config)).toContain('gpt-4o-transcribe')
|
||||
expect(enumOptionsFor('tts.openai.model', 'gpt-4o-mini-tts', config)).toContain('tts-1-hd')
|
||||
expect(enumOptionsFor('tts.neutts.device', 'cpu', config)).toEqual(['cpu', 'cuda', 'mps'])
|
||||
})
|
||||
|
||||
it('renders a dropdown for the terminal execution backend', () => {
|
||||
const opts = enumOptionsFor('terminal.backend', 'local', config)
|
||||
expect(opts).toEqual(['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'])
|
||||
})
|
||||
|
||||
it('appends a hand-typed value not in the known list so it stays selected', () => {
|
||||
const opts = enumOptionsFor('tts.provider', 'my-custom-command-tts', config)
|
||||
expect(opts).toContain('my-custom-command-tts')
|
||||
expect(opts).toContain('xai')
|
||||
})
|
||||
})
|
||||
})
|
||||
|
|
|
|||
|
|
@ -239,7 +239,11 @@ export const ja = defineLocale({
|
|||
backend: '実行バックエンド',
|
||||
timeout: 'コマンドタイムアウト',
|
||||
persistentShell: '永続シェル',
|
||||
envPassthrough: '環境変数の引き継ぎ'
|
||||
envPassthrough: '環境変数の引き継ぎ',
|
||||
dockerImage: 'Docker イメージ',
|
||||
singularityImage: 'Singularity イメージ',
|
||||
modalImage: 'Modal イメージ',
|
||||
daytonaImage: 'Daytona イメージ'
|
||||
},
|
||||
fileReadMaxChars: 'ファイル読み取り上限',
|
||||
toolOutput: {
|
||||
|
|
@ -280,6 +284,15 @@ export const ja = defineLocale({
|
|||
model: 'ローカル文字起こしモデル',
|
||||
language: '文字起こし言語'
|
||||
},
|
||||
openai: {
|
||||
model: 'OpenAI STT モデル'
|
||||
},
|
||||
groq: {
|
||||
model: 'Groq STT モデル'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral STT モデル'
|
||||
},
|
||||
elevenlabs: {
|
||||
modelId: 'ElevenLabs STT モデル',
|
||||
languageCode: 'ElevenLabs 言語',
|
||||
|
|
@ -299,6 +312,33 @@ export const ja = defineLocale({
|
|||
elevenlabs: {
|
||||
voiceId: 'ElevenLabs 音声',
|
||||
modelId: 'ElevenLabs モデル'
|
||||
},
|
||||
xai: {
|
||||
voiceId: 'xAI (Grok) 音声',
|
||||
language: 'xAI 言語'
|
||||
},
|
||||
minimax: {
|
||||
model: 'MiniMax TTS モデル',
|
||||
voiceId: 'MiniMax 音声'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral TTS モデル',
|
||||
voiceId: 'Mistral 音声'
|
||||
},
|
||||
gemini: {
|
||||
model: 'Gemini TTS モデル',
|
||||
voice: 'Gemini 音声'
|
||||
},
|
||||
neutts: {
|
||||
model: 'NeuTTS モデル',
|
||||
device: 'NeuTTS デバイス'
|
||||
},
|
||||
kittentts: {
|
||||
model: 'KittenTTS モデル',
|
||||
voice: 'KittenTTS 音声'
|
||||
},
|
||||
piper: {
|
||||
voice: 'Piper 音声'
|
||||
}
|
||||
},
|
||||
memory: {
|
||||
|
|
|
|||
|
|
@ -233,7 +233,11 @@ export const zhHant = defineLocale({
|
|||
backend: '執行後端',
|
||||
timeout: '指令逾時',
|
||||
persistentShell: '持久化 Shell',
|
||||
envPassthrough: '環境變數傳遞'
|
||||
envPassthrough: '環境變數傳遞',
|
||||
dockerImage: 'Docker 映像',
|
||||
singularityImage: 'Singularity 映像',
|
||||
modalImage: 'Modal 映像',
|
||||
daytonaImage: 'Daytona 映像'
|
||||
},
|
||||
fileReadMaxChars: '檔案讀取上限',
|
||||
toolOutput: {
|
||||
|
|
@ -274,6 +278,15 @@ export const zhHant = defineLocale({
|
|||
model: '本機轉寫模型',
|
||||
language: '轉寫語言'
|
||||
},
|
||||
openai: {
|
||||
model: 'OpenAI STT 模型'
|
||||
},
|
||||
groq: {
|
||||
model: 'Groq STT 模型'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral STT 模型'
|
||||
},
|
||||
elevenlabs: {
|
||||
modelId: 'ElevenLabs STT 模型',
|
||||
languageCode: 'ElevenLabs 語言',
|
||||
|
|
@ -293,6 +306,33 @@ export const zhHant = defineLocale({
|
|||
elevenlabs: {
|
||||
voiceId: 'ElevenLabs 語音',
|
||||
modelId: 'ElevenLabs 模型'
|
||||
},
|
||||
xai: {
|
||||
voiceId: 'xAI (Grok) 語音',
|
||||
language: 'xAI 語言'
|
||||
},
|
||||
minimax: {
|
||||
model: 'MiniMax TTS 模型',
|
||||
voiceId: 'MiniMax 語音'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral TTS 模型',
|
||||
voiceId: 'Mistral 語音'
|
||||
},
|
||||
gemini: {
|
||||
model: 'Gemini TTS 模型',
|
||||
voice: 'Gemini 語音'
|
||||
},
|
||||
neutts: {
|
||||
model: 'NeuTTS 模型',
|
||||
device: 'NeuTTS 裝置'
|
||||
},
|
||||
kittentts: {
|
||||
model: 'KittenTTS 模型',
|
||||
voice: 'KittenTTS 語音'
|
||||
},
|
||||
piper: {
|
||||
voice: 'Piper 語音'
|
||||
}
|
||||
},
|
||||
memory: {
|
||||
|
|
|
|||
|
|
@ -311,7 +311,11 @@ export const zh: Translations = {
|
|||
backend: '执行后端',
|
||||
timeout: '命令超时',
|
||||
persistentShell: '持久化 Shell',
|
||||
envPassthrough: '环境变量透传'
|
||||
envPassthrough: '环境变量透传',
|
||||
dockerImage: 'Docker 镜像',
|
||||
singularityImage: 'Singularity 镜像',
|
||||
modalImage: 'Modal 镜像',
|
||||
daytonaImage: 'Daytona 镜像'
|
||||
},
|
||||
fileReadMaxChars: '文件读取上限',
|
||||
toolOutput: {
|
||||
|
|
@ -352,6 +356,15 @@ export const zh: Translations = {
|
|||
model: '本地转写模型',
|
||||
language: '转写语言'
|
||||
},
|
||||
openai: {
|
||||
model: 'OpenAI STT 模型'
|
||||
},
|
||||
groq: {
|
||||
model: 'Groq STT 模型'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral STT 模型'
|
||||
},
|
||||
elevenlabs: {
|
||||
modelId: 'ElevenLabs STT 模型',
|
||||
languageCode: 'ElevenLabs 语言',
|
||||
|
|
@ -371,6 +384,33 @@ export const zh: Translations = {
|
|||
elevenlabs: {
|
||||
voiceId: 'ElevenLabs 语音',
|
||||
modelId: 'ElevenLabs 模型'
|
||||
},
|
||||
xai: {
|
||||
voiceId: 'xAI (Grok) 语音',
|
||||
language: 'xAI 语言'
|
||||
},
|
||||
minimax: {
|
||||
model: 'MiniMax TTS 模型',
|
||||
voiceId: 'MiniMax 语音'
|
||||
},
|
||||
mistral: {
|
||||
model: 'Mistral TTS 模型',
|
||||
voiceId: 'Mistral 语音'
|
||||
},
|
||||
gemini: {
|
||||
model: 'Gemini TTS 模型',
|
||||
voice: 'Gemini 语音'
|
||||
},
|
||||
neutts: {
|
||||
model: 'NeuTTS 模型',
|
||||
device: 'NeuTTS 设备'
|
||||
},
|
||||
kittentts: {
|
||||
model: 'KittenTTS 模型',
|
||||
voice: 'KittenTTS 语音'
|
||||
},
|
||||
piper: {
|
||||
voice: 'Piper 语音'
|
||||
}
|
||||
},
|
||||
memory: {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue