feat(desktop): full tool-backend config (pickers + per-backend settings) in Settings (#41232)

* feat(desktop): surface TTS/STT/terminal backends as Settings dropdowns

Every native tool backend that the agent supports now shows up as a clickable picker in the desktop Settings UI instead of a free-text box.

Desktop Settings renders a config field as a <Select> only if its dotpath is a key in ENUM_OPTIONS (helpers.ts::enumOptionsFor returns undefined -> free-text <Input> otherwise). Three backend-selector fields were surfaced in their sections but missing from the map, so users had to hand-type the provider name and could reasonably assume it was unsupported:

- tts.provider — now lists all built-in TTS backends incl. xai (Grok)
- stt.provider — local/groq/openai/mistral/elevenlabs
- terminal.backend — local/docker/singularity/modal/daytona/ssh

Each list is kept in sync with its backend source of truth (TTS: agent/tts_registry.py::_BUILTIN_NAMES + tools/tts_tool.py; STT + terminal: hermes_cli/config.py / tools/terminal_tool.py). The existing enumOptionsFor current-value-append keeps any hand-typed/legacy value selected, and command-type TTS providers still work.

Reported for Grok/xAI TTS, which was already a fully-wired built-in provider (tts.provider: xai + XAI_API_KEY) with no picker entry.

* feat(desktop): expose per-backend TTS/STT/terminal config fields in Settings

Completes the backend-coverage pass: not just the provider PICKER but every backend's own config fields are now tunable from desktop Settings, so a user who picks (e.g.) Grok TTS can also set its voice/language without hand-editing config.yaml.

Also fixes the STT provider dropdown: added 'xai' (Grok STT), which the transcription dispatcher (tools/transcription_tools.py) handles but the config.py comment had omitted — the dispatch ladder is the source of truth.
New Settings fields (Voice section):

- TTS xai (voice_id, language), minimax (model, voice_id), mistral (model, voice_id), gemini (model, voice), neutts (model, device), kittentts (model, voice), piper (voice)
- STT openai (model), groq (model), mistral (model)

New Settings fields (Advanced section):

- terminal docker_image / singularity_image / modal_image / daytona_image

New ENUM_OPTIONS dropdowns: stt.provider (+xai), stt.openai.model, stt.mistral.model, tts.openai.model, tts.elevenlabs.model_id, tts.neutts.device. Each list mirrors the backend generator's accepted values (tools/tts_tool.py, tools/transcription_tools.py, hermes_cli/config.py).

i18n: FIELD_LABELS/FIELD_DESCRIPTIONS cover all locales via the English fallback in config-settings.tsx; added native translations to ja/zh/zh-hant.

Secrets (provider API keys, modal/daytona tokens, ssh host/key) intentionally stay in Settings -> Keys as env vars, not duplicated as config fields.
2026-06-30 11:52:04 +00:00 · 2026-06-07 10:05:47 -07:00 · 2026-06-07 10:05:47 -07:00 · 20fd0bde5d
commit 20fd0bde5d
parent 0c48b7165d
5 changed files with 261 additions and 6 deletions
--- a/apps/desktop/src/app/settings/constants.ts
+++ b/apps/desktop/src/app/settings/constants.ts
@ -240,9 +240,37 @@ export const ENUM_OPTIONS: Record<string, string[]> = {
  'context.engine': ['compressor', 'default', 'custom'],
  'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'],
  'memory.provider': ['', 'builtin', 'honcho'],
+  // Terminal execution backends — kept in sync with the dispatch ladder in
+  // tools/terminal_tool.py::_create_environment (local/docker/singularity/
+  // modal/daytona/ssh). Remote backends need extra env (image, tokens, host).
+  'terminal.backend': ['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'],
  'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'],
  'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'],
+  // Speech-to-text backends — mirrors the tools/transcription_tools.py dispatch
+  // ladder (local/groq/openai/mistral/xai/elevenlabs); config.py's list omits xai.
+  'stt.provider': ['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'],
  'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'],
+  // Text-to-speech backends — kept in sync with the built-in source of truth
+  // (agent/tts_registry.py::_BUILTIN_NAMES / tools/tts_tool.py::
+  // BUILTIN_TTS_PROVIDERS). 'xai' is Grok TTS.
+  'tts.provider': [
+    'edge',
+    'elevenlabs',
+    'openai',
+    'xai',
+    'minimax',
+    'mistral',
+    'gemini',
+    'neutts',
+    'kittentts',
+    'piper'
+  ],
+  'stt.openai.model': ['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'],
+  'stt.mistral.model': ['voxtral-mini-latest', 'voxtral-mini-2602'],
+  'tts.openai.model': ['gpt-4o-mini-tts', 'tts-1', 'tts-1-hd'],
+  'tts.elevenlabs.model_id': ['eleven_multilingual_v2', 'eleven_turbo_v2_5', 'eleven_flash_v2_5'],
+  // NeuTTS local inference device.
+  'tts.neutts.device': ['cpu', 'cuda', 'mps'],
  'updates.non_interactive_local_changes': ['stash', 'discard']
 }

@ -268,7 +296,11 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
    backend: 'Execution Backend',
    timeout: 'Command Timeout',
    persistentShell: 'Persistent Shell',
-    envPassthrough: 'Environment Passthrough'
+    envPassthrough: 'Environment Passthrough',
+    dockerImage: 'Docker Image',
+    singularityImage: 'Singularity Image',
+    modalImage: 'Modal Image',
+    daytonaImage: 'Daytona Image'
  },
  fileReadMaxChars: 'File Read Limit',
  toolOutput: {
@ -309,6 +341,15 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
      model: 'Local Transcription Model',
      language: 'Transcription Language'
    },
+    openai: {
+      model: 'OpenAI STT Model'
+    },
+    groq: {
+      model: 'Groq STT Model'
+    },
+    mistral: {
+      model: 'Mistral STT Model'
+    },
    elevenlabs: {
      modelId: 'ElevenLabs STT Model',
      languageCode: 'ElevenLabs Language',
@ -328,6 +369,33 @@ export const FIELD_LABELS: Record<string, string> = defineFieldCopy({
    elevenlabs: {
      voiceId: 'ElevenLabs Voice',
      modelId: 'ElevenLabs Model'
+    },
+    xai: {
+      voiceId: 'xAI (Grok) Voice',
+      language: 'xAI Language'
+    },
+    minimax: {
+      model: 'MiniMax TTS Model',
+      voiceId: 'MiniMax Voice'
+    },
+    mistral: {
+      model: 'Mistral TTS Model',
+      voiceId: 'Mistral Voice'
+    },
+    gemini: {
+      model: 'Gemini TTS Model',
+      voice: 'Gemini Voice'
+    },
+    neutts: {
+      model: 'NeuTTS Model',
+      device: 'NeuTTS Device'
+    },
+    kittentts: {
+      model: 'KittenTTS Model',
+      voice: 'KittenTTS Voice'
+    },
+    piper: {
+      voice: 'Piper Voice'
    }
  },
  memory: {
@ -375,7 +443,11 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
  terminal: {
    cwd: 'Default project folder for tool and terminal work.',
    persistentShell: 'Keep shell state between commands when the backend supports it.',
-    envPassthrough: 'Environment variables to pass into tool execution.'
+    envPassthrough: 'Environment variables to pass into tool execution.',
+    dockerImage: 'Container image used when the execution backend is Docker.',
+    singularityImage: 'Image used when the execution backend is Singularity.',
+    modalImage: 'Image used when the execution backend is Modal.',
+    daytonaImage: 'Image used when the execution backend is Daytona.'
  },
  codeExecution: {
    mode: 'How strictly code execution is scoped to the current project.'
@ -404,6 +476,15 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = defineFieldCopy({
  voice: {
    autoTts: 'Automatically speak assistant responses.'
  },
+  tts: {
+    xai: {
+      voiceId: 'xAI voice ID (e.g. eve) or a custom voice ID.',
+      language: 'Spoken language code, e.g. en.'
+    },
+    neutts: {
+      device: 'Local inference device for NeuTTS.'
+    }
+  },
  stt: {
    enabled: 'Enable local or provider-backed speech transcription.',
    elevenlabs: {
@ -495,8 +576,24 @@ export const SECTIONS: DesktopConfigSection[] = [
      'tts.openai.voice',
      'tts.elevenlabs.voice_id',
      'tts.elevenlabs.model_id',
+      'tts.xai.voice_id',
+      'tts.xai.language',
+      'tts.minimax.model',
+      'tts.minimax.voice_id',
+      'tts.mistral.model',
+      'tts.mistral.voice_id',
+      'tts.gemini.model',
+      'tts.gemini.voice',
+      'tts.neutts.model',
+      'tts.neutts.device',
+      'tts.kittentts.model',
+      'tts.kittentts.voice',
+      'tts.piper.voice',
      'stt.local.model',
      'stt.local.language',
+      'stt.openai.model',
+      'stt.groq.model',
+      'stt.mistral.model',
      'stt.elevenlabs.model_id',
      'stt.elevenlabs.language_code',
      'stt.elevenlabs.tag_audio_events',
@ -513,6 +610,10 @@ export const SECTIONS: DesktopConfigSection[] = [
      'toolsets',
      'terminal.backend',
      'terminal.timeout',
+      'terminal.docker_image',
+      'terminal.singularity_image',
+      'terminal.modal_image',
+      'terminal.daytona_image',
      'tool_output.max_bytes',
      'tool_output.max_lines',
      'tool_output.max_line_length',
--- a/apps/desktop/src/app/settings/helpers.test.ts
+++ b/apps/desktop/src/app/settings/helpers.test.ts
@ -3,7 +3,7 @@ import { describe, expect, it } from 'vitest'
 import type { HermesConfigRecord } from '@/types/hermes'

 import { defineFieldCopy, fieldCopyForSchemaKey, schemaKeyToFieldCopyKey } from './field-copy'
-import { getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'
+import { enumOptionsFor, getNested, providerGroup, setNested, stripToolsetLabel, toolsetDisplayLabel } from './helpers'

 describe('settings helpers', () => {
  describe('defineFieldCopy', () => {
@ -135,4 +135,38 @@ describe('settings helpers', () => {
      expect(providerGroup('SOMETHING_RANDOM')).toBe('Other')
    })
  })
+
+  describe('enumOptionsFor — backend selector dropdowns', () => {
+    const config: HermesConfigRecord = {}
+
+    it('renders a dropdown for the TTS provider including xAI (Grok)', () => {
+      const opts = enumOptionsFor('tts.provider', 'edge', config)
+      expect(opts).toBeDefined()
+      expect(opts).toContain('xai')
+      expect(opts).toContain('edge')
+      expect(opts).toContain('elevenlabs')
+    })
+
+    it('renders a dropdown for the STT provider including xAI (Grok)', () => {
+      const opts = enumOptionsFor('stt.provider', 'local', config)
+      expect(opts).toEqual(['local', 'groq', 'openai', 'mistral', 'xai', 'elevenlabs'])
+    })
+
+    it('renders dropdowns for per-backend model/device sub-fields', () => {
+      expect(enumOptionsFor('stt.openai.model', 'whisper-1', config)).toContain('gpt-4o-transcribe')
+      expect(enumOptionsFor('tts.openai.model', 'gpt-4o-mini-tts', config)).toContain('tts-1-hd')
+      expect(enumOptionsFor('tts.neutts.device', 'cpu', config)).toEqual(['cpu', 'cuda', 'mps'])
+    })
+
+    it('renders a dropdown for the terminal execution backend', () => {
+      const opts = enumOptionsFor('terminal.backend', 'local', config)
+      expect(opts).toEqual(['local', 'docker', 'singularity', 'modal', 'daytona', 'ssh'])
+    })
+
+    it('appends a hand-typed value not in the known list so it stays selected', () => {
+      const opts = enumOptionsFor('tts.provider', 'my-custom-command-tts', config)
+      expect(opts).toContain('my-custom-command-tts')
+      expect(opts).toContain('xai')
+    })
+  })
 })
--- a/apps/desktop/src/i18n/ja.ts
+++ b/apps/desktop/src/i18n/ja.ts
@ -239,7 +239,11 @@ export const ja = defineLocale({
        backend: '実行バックエンド',
        timeout: 'コマンドタイムアウト',
        persistentShell: '永続シェル',
-        envPassthrough: '環境変数の引き継ぎ'
+        envPassthrough: '環境変数の引き継ぎ',
+        dockerImage: 'Docker イメージ',
+        singularityImage: 'Singularity イメージ',
+        modalImage: 'Modal イメージ',
+        daytonaImage: 'Daytona イメージ'
      },
      fileReadMaxChars: 'ファイル読み取り上限',
      toolOutput: {
@ -280,6 +284,15 @@ export const ja = defineLocale({
          model: 'ローカル文字起こしモデル',
          language: '文字起こし言語'
        },
+        openai: {
+          model: 'OpenAI STT モデル'
+        },
+        groq: {
+          model: 'Groq STT モデル'
+        },
+        mistral: {
+          model: 'Mistral STT モデル'
+        },
        elevenlabs: {
          modelId: 'ElevenLabs STT モデル',
          languageCode: 'ElevenLabs 言語',
@ -299,6 +312,33 @@ export const ja = defineLocale({
        elevenlabs: {
          voiceId: 'ElevenLabs 音声',
          modelId: 'ElevenLabs モデル'
+        },
+        xai: {
+          voiceId: 'xAI (Grok) 音声',
+          language: 'xAI 言語'
+        },
+        minimax: {
+          model: 'MiniMax TTS モデル',
+          voiceId: 'MiniMax 音声'
+        },
+        mistral: {
+          model: 'Mistral TTS モデル',
+          voiceId: 'Mistral 音声'
+        },
+        gemini: {
+          model: 'Gemini TTS モデル',
+          voice: 'Gemini 音声'
+        },
+        neutts: {
+          model: 'NeuTTS モデル',
+          device: 'NeuTTS デバイス'
+        },
+        kittentts: {
+          model: 'KittenTTS モデル',
+          voice: 'KittenTTS 音声'
+        },
+        piper: {
+          voice: 'Piper 音声'
        }
      },
      memory: {
--- a/apps/desktop/src/i18n/zh-hant.ts
+++ b/apps/desktop/src/i18n/zh-hant.ts
@ -233,7 +233,11 @@ export const zhHant = defineLocale({
        backend: '執行後端',
        timeout: '指令逾時',
        persistentShell: '持久化 Shell',
-        envPassthrough: '環境變數傳遞'
+        envPassthrough: '環境變數傳遞',
+        dockerImage: 'Docker 映像',
+        singularityImage: 'Singularity 映像',
+        modalImage: 'Modal 映像',
+        daytonaImage: 'Daytona 映像'
      },
      fileReadMaxChars: '檔案讀取上限',
      toolOutput: {
@ -274,6 +278,15 @@ export const zhHant = defineLocale({
          model: '本機轉寫模型',
          language: '轉寫語言'
        },
+        openai: {
+          model: 'OpenAI STT 模型'
+        },
+        groq: {
+          model: 'Groq STT 模型'
+        },
+        mistral: {
+          model: 'Mistral STT 模型'
+        },
        elevenlabs: {
          modelId: 'ElevenLabs STT 模型',
          languageCode: 'ElevenLabs 語言',
@ -293,6 +306,33 @@ export const zhHant = defineLocale({
        elevenlabs: {
          voiceId: 'ElevenLabs 語音',
          modelId: 'ElevenLabs 模型'
+        },
+        xai: {
+          voiceId: 'xAI (Grok) 語音',
+          language: 'xAI 語言'
+        },
+        minimax: {
+          model: 'MiniMax TTS 模型',
+          voiceId: 'MiniMax 語音'
+        },
+        mistral: {
+          model: 'Mistral TTS 模型',
+          voiceId: 'Mistral 語音'
+        },
+        gemini: {
+          model: 'Gemini TTS 模型',
+          voice: 'Gemini 語音'
+        },
+        neutts: {
+          model: 'NeuTTS 模型',
+          device: 'NeuTTS 裝置'
+        },
+        kittentts: {
+          model: 'KittenTTS 模型',
+          voice: 'KittenTTS 語音'
+        },
+        piper: {
+          voice: 'Piper 語音'
        }
      },
      memory: {
--- a/apps/desktop/src/i18n/zh.ts
+++ b/apps/desktop/src/i18n/zh.ts
@ -311,7 +311,11 @@ export const zh: Translations = {
        backend: '执行后端',
        timeout: '命令超时',
        persistentShell: '持久化 Shell',
-        envPassthrough: '环境变量透传'
+        envPassthrough: '环境变量透传',
+        dockerImage: 'Docker 镜像',
+        singularityImage: 'Singularity 镜像',
+        modalImage: 'Modal 镜像',
+        daytonaImage: 'Daytona 镜像'
      },
      fileReadMaxChars: '文件读取上限',
      toolOutput: {
@ -352,6 +356,15 @@ export const zh: Translations = {
          model: '本地转写模型',
          language: '转写语言'
        },
+        openai: {
+          model: 'OpenAI STT 模型'
+        },
+        groq: {
+          model: 'Groq STT 模型'
+        },
+        mistral: {
+          model: 'Mistral STT 模型'
+        },
        elevenlabs: {
          modelId: 'ElevenLabs STT 模型',
          languageCode: 'ElevenLabs 语言',
@ -371,6 +384,33 @@ export const zh: Translations = {
        elevenlabs: {
          voiceId: 'ElevenLabs 语音',
          modelId: 'ElevenLabs 模型'
+        },
+        xai: {
+          voiceId: 'xAI (Grok) 语音',
+          language: 'xAI 语言'
+        },
+        minimax: {
+          model: 'MiniMax TTS 模型',
+          voiceId: 'MiniMax 语音'
+        },
+        mistral: {
+          model: 'Mistral TTS 模型',
+          voiceId: 'Mistral 语音'
+        },
+        gemini: {
+          model: 'Gemini TTS 模型',
+          voice: 'Gemini 语音'
+        },
+        neutts: {
+          model: 'NeuTTS 模型',
+          device: 'NeuTTS 设备'
+        },
+        kittentts: {
+          model: 'KittenTTS 模型',
+          voice: 'KittenTTS 语音'
+        },
+        piper: {
+          voice: 'Piper 语音'
        }
      },
      memory: {