diff --git a/agent/lmstudio_reasoning.py b/agent/lmstudio_reasoning.py
new file mode 100644
index 0000000000..48ca667353
--- /dev/null
+++ b/agent/lmstudio_reasoning.py
@@ -0,0 +1,48 @@
+"""LM Studio reasoning-effort resolution shared by the chat-completions
+transport and run_agent's iteration-limit summary path.
+
+LM Studio publishes per-model ``capabilities.reasoning.allowed_options`` (e.g.
+``["off","on"]`` for toggle-style models, ``["off","minimal","low"]`` for
+graduated models). We map the user's ``reasoning_config`` onto LM Studio's
+OpenAI-compatible vocabulary, then clamp against the model's allowed set so
+the server doesn't 400 on an unsupported effort.
+"""
+
+from __future__ import annotations
+
+from typing import List, Optional
+
+# LM Studio accepts these top-level reasoning_effort values via its
+# OpenAI-compatible chat.completions endpoint.
+_LM_VALID_EFFORTS = {"none", "minimal", "low", "medium", "high", "xhigh"}
+
+# Toggle-style models publish allowed_options as ["off","on"] in /api/v1/models.
+# Map them onto the OpenAI-compatible request vocabulary.
+_LM_EFFORT_ALIASES = {"off": "none", "on": "medium"}
+
+
+def resolve_lmstudio_effort(
+    reasoning_config: Optional[dict],
+    allowed_options: Optional[List[str]],
+) -> Optional[str]:
+    """Return the ``reasoning_effort`` string to send to LM Studio, or ``None``.
+
+    ``None`` means "omit the field": the user picked a level the model can't
+    honor, so let LM Studio fall back to the model's declared default rather
+    than silently substituting a different effort. When ``allowed_options`` is
+    falsy (probe failed), skip clamping and send the resolved effort anyway.
+    """
+    effort = "medium"
+    if reasoning_config and isinstance(reasoning_config, dict):
+        if reasoning_config.get("enabled") is False:
+            effort = "none"
+        else:
+            raw = (reasoning_config.get("effort") or "").strip().lower()
+            raw = _LM_EFFORT_ALIASES.get(raw, raw)
+            if raw in _LM_VALID_EFFORTS:
+                effort = raw
+    if allowed_options:
+        allowed = {_LM_EFFORT_ALIASES.get(opt, opt) for opt in allowed_options}
+        if effort not in allowed:
+            return None
+    return effort
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 44135e2e65..d883263e62 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -1281,7 +1281,10 @@ def get_model_context_length(
     model = _strip_provider_prefix(model)
 
     # 1. Check persistent cache (model+provider)
-    if base_url:
+    # LM Studio is excluded — its loaded context length is transient (the
+    # user can reload the model with a different context_length at any time
+    # via /api/v1/models/load), so a stale cached value would mask reloads.
+    if base_url and provider != "lmstudio":
         cached = get_cached_context_length(model, base_url)
         if cached is not None:
             # Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
@@ -1334,7 +1337,8 @@ def get_model_context_length(
         if is_local_endpoint(base_url):
             local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
             if local_ctx and local_ctx > 0:
-                save_context_length(model, base_url, local_ctx)
+                if provider != "lmstudio":
+                    save_context_length(model, base_url, local_ctx)
                 return local_ctx
         logger.info(
             "Could not detect context length for model %r at %s — "
@@ -1424,7 +1428,8 @@ def get_model_context_length(
     if base_url and is_local_endpoint(base_url):
         local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
         if local_ctx and local_ctx > 0:
-            save_context_length(model, base_url, local_ctx)
+            if provider != "lmstudio":
+                save_context_length(model, base_url, local_ctx)
             return local_ctx
 
     # 10. Default fallback — 128K
diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py
index 480ba05d27..9c2db2c6c7 100644
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@@ -12,6 +12,7 @@ reasoning configuration, temperature handling, and extra_body assembly.
 import copy
 from typing import Any, Dict, List, Optional
 
+from agent.lmstudio_reasoning import resolve_lmstudio_effort
 from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
 from agent.prompt_builder import DEVELOPER_ROLE_MODELS
 from agent.transports.base import ProviderTransport
@@ -153,6 +154,8 @@ class ChatCompletionsTransport(ProviderTransport):
         is_github_models: bool
         is_nvidia_nim: bool
         is_kimi: bool
+        is_tokenhub: bool
+        is_lmstudio: bool
         is_custom_provider: bool
         ollama_num_ctx: int | None
         # Provider routing
@@ -166,6 +169,7 @@ class ChatCompletionsTransport(ProviderTransport):
         # Reasoning
         supports_reasoning: bool
         github_reasoning_extra: dict | None
+        lmstudio_reasoning_options: list[str] | None  # raw allowed_options from /api/v1/models
         # Claude on OpenRouter/Nous max output
         anthropic_max_output: int | None
         # Extra
@@ -287,6 +291,18 @@ class ChatCompletionsTransport(ProviderTransport):
                 _tokenhub_effort = _e
             api_kwargs["reasoning_effort"] = _tokenhub_effort
 
+        # LM Studio: top-level reasoning_effort. Only emit when the model
+        # declares reasoning support via /api/v1/models capabilities (gated
+        # upstream by params["supports_reasoning"]). resolve_lmstudio_effort
+        # is shared with run_agent's summary path so both stay in sync.
+        if params.get("is_lmstudio", False) and params.get("supports_reasoning", False):
+            _lm_effort = resolve_lmstudio_effort(
+                reasoning_config,
+                params.get("lmstudio_reasoning_options"),
+            )
+            if _lm_effort is not None:
+                api_kwargs["reasoning_effort"] = _lm_effort
+
         # extra_body assembly
         extra_body: Dict[str, Any] = {}
 
@@ -309,8 +325,9 @@ class ChatCompletionsTransport(ProviderTransport):
                 "type": "enabled" if _kimi_thinking_enabled else "disabled",
             }
 
-        # Reasoning
-        if params.get("supports_reasoning", False):
+        # Reasoning. LM Studio is handled above via top-level reasoning_effort,
+        # so skip emitting extra_body.reasoning for it.
+        if params.get("supports_reasoning", False) and not params.get("is_lmstudio", False):
             if is_github_models:
                 gh_reasoning = params.get("github_reasoning_extra")
                 if gh_reasoning is not None:
diff --git a/cli-config.yaml.example b/cli-config.yaml.example
index d6cb0bcb46..56b925f8ff 100644
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -30,14 +30,13 @@ model:
   #   "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
   #   "kilocode"     - KiloCode gateway (requires: KILOCODE_API_KEY)
   #   "ai-gateway"   - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
+  #   "lmstudio"     - LM Studio local server (optional: LM_API_KEY, defaults to http://127.0.0.1:1234/v1)
   #
   # Local servers (LM Studio, Ollama, vLLM, llama.cpp):
-  #   "custom"       - Any OpenAI-compatible endpoint. Set base_url below.
-  #                    Aliases: "lmstudio", "ollama", "vllm", "llamacpp" all map to "custom".
-  #                    Example for LM Studio:
-  #                      provider: "lmstudio"
-  #                      base_url: "http://localhost:1234/v1"
-  #                    No API key needed — local servers typically ignore auth.
+  #   "custom"       - Any other OpenAI-compatible endpoint. Set base_url below.
+  #                    Aliases: "ollama", "vllm", "llamacpp" all map to "custom".
+  #                    LM Studio is first-class and uses provider: "lmstudio".
+  #                    It works with both no-auth and auth-enabled server modes.
   #
   # Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
   provider: "auto"
diff --git a/cli.py b/cli.py
index dc73c8f089..33a4f585e2 100644
--- a/cli.py
+++ b/cli.py
@@ -5459,6 +5459,8 @@ class HermesCLI:
         try:
             providers = list_authenticated_providers(
                 current_provider=self.provider or "",
+                current_base_url=self.base_url or "",
+                current_model=self.model or "",
                 user_providers=user_provs,
                 custom_providers=custom_provs,
                 max_models=50,
diff --git a/gateway/run.py b/gateway/run.py
index 4d6f5b86ad..c759cb4d3f 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -6169,6 +6169,7 @@ class GatewayRunner:
             providers = list_authenticated_providers(
                 current_provider=current_provider,
                 current_base_url=current_base_url,
+                current_model=current_model,
                 user_providers=user_provs,
                 custom_providers=custom_provs,
                 max_models=50,
@@ -6290,6 +6291,7 @@ class GatewayRunner:
             providers = list_authenticated_providers(
                 current_provider=current_provider,
                 current_base_url=current_base_url,
+                current_model=current_model,
                 user_providers=user_provs,
                 custom_providers=custom_provs,
                 max_models=5,
diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py
index 61a5760dcb..c5ff23e586 100644
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -110,6 +110,12 @@ SERVICE_PROVIDER_NAMES: Dict[str, str] = {
 DEFAULT_GEMINI_CLOUDCODE_BASE_URL = "cloudcode-pa://google"
 GEMINI_OAUTH_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 60  # refresh 60s before expiry
 
+# LM Studio's default no-auth mode still requires *some* non-empty bearer for
+# the API-key code paths (auxiliary_client, runtime resolver) to treat the
+# provider as configured. This sentinel is sent only to LM Studio, never to
+# any remote service.
+LMSTUDIO_NOAUTH_PLACEHOLDER = "dummy-lm-api-key" + # ============================================================================= # Provider Registry @@ -160,6 +166,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = { auth_type="oauth_external", inference_base_url=DEFAULT_GEMINI_CLOUDCODE_BASE_URL, ), + "lmstudio": ProviderConfig( + id="lmstudio", + name="LM Studio", + auth_type="api_key", + inference_base_url="http://127.0.0.1:1234/v1", + api_key_env_vars=("LM_API_KEY",), + base_url_env_var="LM_BASE_URL", + ), "copilot": ProviderConfig( id="copilot", name="GitHub Copilot", @@ -1155,8 +1169,8 @@ def resolve_provider( "aws": "bedrock", "aws-bedrock": "bedrock", "amazon-bedrock": "bedrock", "amazon": "bedrock", "go": "opencode-go", "opencode-go-sub": "opencode-go", "kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode", + "lmstudio": "lmstudio", "lm-studio": "lmstudio", "lm_studio": "lmstudio", # Local server aliases — route through the generic custom provider - "lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom", "ollama": "custom", "ollama_cloud": "ollama-cloud", "vllm": "custom", "llamacpp": "custom", "llama.cpp": "custom", "llama-cpp": "custom", @@ -1203,8 +1217,11 @@ def resolve_provider( continue # GitHub tokens are commonly present for repo/tool access but should not # hijack inference auto-selection unless the user explicitly chooses - # Copilot/GitHub Models as the provider. - if pid == "copilot": + # Copilot/GitHub Models as the provider. LM Studio is a local server + # whose availability isn't implied by LM_API_KEY presence (it may be + # offline, and the no-auth setup uses a placeholder value), so it + # also requires explicit selection. + if pid in ("copilot", "lmstudio"): continue for env_var in pconfig.api_key_env_vars: if has_usable_secret(os.getenv(env_var, "")): @@ -3482,6 +3499,13 @@ def resolve_api_key_provider_credentials(provider_id: str) -> Dict[str, Any]: key_source = "" api_key, key_source = _resolve_api_key_provider_secret(provider_id, pconfig) + # No-auth LM Studio: substitute a placeholder so runtime / auxiliary_client + # see the local server as configured. doctor still reports unconfigured + # because get_api_key_provider_status uses the raw secret resolver. + if not api_key and provider_id == "lmstudio": + api_key = LMSTUDIO_NOAUTH_PLACEHOLDER + key_source = key_source or "default" + env_url = "" if pconfig.base_url_env_var: env_url = os.getenv(pconfig.base_url_env_var, "").strip() diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index d83f2ac9eb..5c732720e5 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -946,6 +946,42 @@ def slack_subcommand_map() -> dict[str, str]: # Autocomplete # --------------------------------------------------------------------------- + +# Per-process cache for /model LM Studio autocomplete. Probing on +# every keystroke would block the UI; a short TTL keeps it live without +# hammering the server. +_LMSTUDIO_COMPLETION_CACHE: tuple[float, list[str]] | None = None + + +def _lmstudio_completion_models() -> list[str]: + """Locally-loaded LM Studio models for /model autocomplete (cached, gated).""" + global _LMSTUDIO_COMPLETION_CACHE + # Gate: don't probe 127.0.0.1 on every keystroke for users who don't use LM Studio. 
+ if not (os.environ.get("LM_API_KEY") or os.environ.get("LM_BASE_URL")): + try: + from hermes_cli.auth import _load_auth_store + store = _load_auth_store() or {} + if "lmstudio" not in (store.get("providers") or {}) \ + and "lmstudio" not in (store.get("credential_pool") or {}): + return [] + except Exception: + return [] + now = time.time() + if _LMSTUDIO_COMPLETION_CACHE and (now - _LMSTUDIO_COMPLETION_CACHE[0]) < 30.0: + return _LMSTUDIO_COMPLETION_CACHE[1] + try: + from hermes_cli.models import fetch_lmstudio_models + models = fetch_lmstudio_models( + api_key=os.environ.get("LM_API_KEY", ""), + base_url=os.environ.get("LM_BASE_URL") or "http://127.0.0.1:1234/v1", + timeout=0.8, + ) + except Exception: + models = [] + _LMSTUDIO_COMPLETION_CACHE = (now, models) + return models + + class SlashCommandCompleter(Completer): """Autocomplete for built-in slash commands, subcommands, and skill commands.""" @@ -1369,6 +1405,19 @@ class SlashCommandCompleter(Completer): ) except Exception: pass + # LM Studio: surface locally-loaded models. Gated on the user actually + # having LM Studio configured (env var or auth-store entry) so we + # don't probe 127.0.0.1 on every keystroke for users who don't use it. + for name in _lmstudio_completion_models(): + if name in seen: + continue + if name.startswith(sub_lower) and name != sub_lower: + yield Completion( + name, + start_position=-len(sub_text), + display=name, + display_meta="LM Studio", + ) def get_completions(self, document, complete_event): text = document.text_before_cursor diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7291bfe330..0c3b39393c 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1123,7 +1123,7 @@ DEFAULT_CONFIG = { }, # Config schema version - bump this when adding new required fields - "_config_version": 22, + "_config_version": 23, } # ============================================================================= @@ -1223,6 +1223,22 @@ OPTIONAL_ENV_VARS = { "category": "provider", "advanced": True, }, + "LM_API_KEY": { + "description": "LM Studio bearer token for auth-enabled local servers", + "prompt": "LM Studio API key / bearer token", + "url": None, + "password": True, + "category": "provider", + "advanced": True, + }, + "LM_BASE_URL": { + "description": "LM Studio base URL override", + "prompt": "LM Studio base URL (leave empty for default)", + "url": None, + "password": False, + "category": "provider", + "advanced": True, + }, "GLM_API_KEY": { "description": "Z.AI / GLM API key (also recognized as ZAI_API_KEY / Z_AI_API_KEY)", "prompt": "Z.AI / GLM API key", @@ -3107,6 +3123,28 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A "Use `hermes plugins enable ` to activate." ) + # ── Version 22 → 23: ensure LM_API_KEY is set when provider is lmstudio ── + # LM Studio's documented default is no-auth, but our API-key registry + # path needs *some* non-empty value to satisfy auxiliary_client and + # runtime resolution. Self-heal users whose config.yaml has + # provider:lmstudio but no LM_API_KEY in .env (cross-machine sync, + # manual edit, profile move). 
+ if current_ver < 23: + try: + from hermes_cli.auth import LMSTUDIO_NOAUTH_PLACEHOLDER + config = load_config() + model_cfg = config.get("model") + if isinstance(model_cfg, dict) and str(model_cfg.get("provider") or "").strip().lower() == "lmstudio": + if not get_env_value("LM_API_KEY"): + save_env_value("LM_API_KEY", LMSTUDIO_NOAUTH_PLACEHOLDER) + results["env_added"].append( + f"LM_API_KEY={LMSTUDIO_NOAUTH_PLACEHOLDER} (placeholder for no-auth LM Studio)" + ) + if not quiet: + print(" ✓ Added placeholder LM_API_KEY for LM Studio (no-auth default)") + except Exception: + pass + if current_ver < latest_ver and not quiet: print(f"Config version: {current_ver} → {latest_ver}") @@ -3806,7 +3844,7 @@ def save_env_value(key: str, value: str): value = _check_non_ascii_credential(key, value) ensure_hermes_home() env_path = get_env_path() - + # On Windows, open() defaults to the system locale (cp1252) which can # cause OSError errno 22 on UTF-8 .env files. read_kw = {"encoding": "utf-8", "errors": "replace"} if _IS_WINDOWS else {} @@ -3818,7 +3856,7 @@ def save_env_value(key: str, value: str): lines = f.readlines() # Sanitize on every read: split concatenated keys, drop stale placeholders lines = _sanitize_env_lines(lines) - + # Find and update or append found = False for i, line in enumerate(lines): @@ -3826,7 +3864,7 @@ def save_env_value(key: str, value: str): lines[i] = f"{key}={value}\n" found = True break - + if not found: # Ensure there's a newline at the end of the file before appending if lines and not lines[-1].endswith("\n"): diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py index dd9d2d164a..e49844bac2 100644 --- a/hermes_cli/doctor.py +++ b/hermes_cli/doctor.py @@ -344,7 +344,7 @@ def run_doctor(args): ) # Warn if model is set to a provider-prefixed name on a provider that doesn't use them - if default_model and "/" in default_model and canonical_provider and canonical_provider not in ("openrouter", "custom", "auto", "ai-gateway", "kilocode", "opencode-zen", "huggingface", "nous"): + if default_model and "/" in default_model and canonical_provider and canonical_provider not in ("openrouter", "custom", "auto", "ai-gateway", "kilocode", "opencode-zen", "huggingface", "nous", "lmstudio"): check_warn( f"model.default '{default_model}' uses a vendor/model slug but provider is '{provider_raw}'", "(vendor-prefixed slugs belong to aggregators like openrouter)", diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 2ec8e44481..607883d596 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -1821,6 +1821,7 @@ def select_provider_and_model(args=None): "nvidia", "ollama-cloud", "tencent-tokenhub", + "lmstudio", ): _model_flow_api_key_provider(config, selected_provider, current_model) @@ -2047,7 +2048,11 @@ def _aux_select_for_task(task: str) -> None: # Gather authenticated providers (has credentials + curated model list) try: - providers = list_authenticated_providers(current_provider=current_provider) + providers = list_authenticated_providers( + current_provider=current_provider, + current_model=current_model, + current_base_url=current_base_url, + ) except Exception as exc: print(f"Could not detect authenticated providers: {exc}") providers = [] @@ -4377,6 +4382,7 @@ def _model_flow_bedrock(config, current_model=""): def _model_flow_api_key_provider(config, provider_id, current_model=""): """Generic flow for API-key providers (z.ai, MiniMax, OpenCode, etc.).""" from hermes_cli.auth import ( + LMSTUDIO_NOAUTH_PLACEHOLDER, PROVIDER_REGISTRY, _prompt_model_selection, 
_save_model_choice, @@ -4411,13 +4417,20 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""): try: import getpass - new_key = getpass.getpass(f"{key_env} (or Enter to cancel): ").strip() + if provider_id == "lmstudio": + prompt = f"{key_env} (Enter for no-auth default {LMSTUDIO_NOAUTH_PLACEHOLDER!r}): " + else: + prompt = f"{key_env} (or Enter to cancel): " + new_key = getpass.getpass(prompt).strip() except (KeyboardInterrupt, EOFError): print() return if not new_key: - print("Cancelled.") - return + if provider_id == "lmstudio": + new_key = LMSTUDIO_NOAUTH_PLACEHOLDER + else: + print("Cancelled.") + return save_env_value(key_env, new_key) existing_key = new_key print("API key saved.") @@ -4484,10 +4497,21 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""): print(" Tier check: could not verify (proceeding anyway).") print() - # Optional base URL override + # Optional base URL override. + # Precedence: env var → config.yaml model.base_url → registry default. + # Reading config.yaml prevents silently overwriting a saved remote URL + # (e.g. a remote LM Studio endpoint) with localhost when the user just + # presses Enter at the prompt below. current_base = "" if base_url_env: current_base = get_env_value(base_url_env) or os.getenv(base_url_env, "") + if not current_base: + try: + _m = load_config().get("model") or {} + if str(_m.get("provider") or "").strip().lower() == provider_id: + current_base = str(_m.get("base_url") or "").strip() + except Exception: + pass effective_base = current_base or pconfig.inference_base_url try: @@ -4509,8 +4533,22 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""): # 2. Curated static fallback list (offline insurance) # 3. Live /models endpoint probe (small providers without models.dev data) # - # Ollama Cloud: dedicated merged discovery (live API + models.dev + disk cache) - if provider_id == "ollama-cloud": + # LM Studio: live /api/v1/models probe (no models.dev catalog). + # Ollama Cloud: merged discovery (live API + models.dev + disk cache). + if provider_id == "lmstudio": + from hermes_cli.auth import AuthError + from hermes_cli.models import fetch_lmstudio_models + + api_key_for_probe = existing_key or (get_env_value(key_env) if key_env else "") + try: + model_list = fetch_lmstudio_models(api_key=api_key_for_probe, base_url=effective_base) + except AuthError as exc: + print(f" LM Studio rejected the request: {exc}") + print(" Set LM_API_KEY (or update it) to match the server's bearer token.") + model_list = [] + if model_list: + print(f" Found {len(model_list)} model(s) from LM Studio") + elif provider_id == "ollama-cloud": from hermes_cli.models import fetch_ollama_cloud_models api_key_for_probe = existing_key or (get_env_value(key_env) if key_env else "") diff --git a/hermes_cli/model_switch.py b/hermes_cli/model_switch.py index d5a118f2db..869d82bf6d 100644 --- a/hermes_cli/model_switch.py +++ b/hermes_cli/model_switch.py @@ -984,6 +984,7 @@ def list_authenticated_providers( user_providers: dict = None, custom_providers: list | None = None, max_models: int = 8, + current_model: str = "", ) -> List[dict]: """Detect which providers have credentials and list their curated models. 
@@ -1030,6 +1031,34 @@ def list_authenticated_providers( if "ollama-cloud" not in curated: from hermes_cli.models import fetch_ollama_cloud_models curated["ollama-cloud"] = fetch_ollama_cloud_models() + # LM Studio has no static catalog — probe its native /api/v1/models + # endpoint live so the picker reflects whatever the user has loaded. + # Base URL precedence: LM_BASE_URL env var > active config's base_url + # (when current provider is lmstudio) > 127.0.0.1 default. + # On auth rejection or unreachable server, fall back to the caller-supplied + # current model so the picker still shows something when offline / mis-keyed. + if "lmstudio" not in curated and ( + os.environ.get("LM_API_KEY") or os.environ.get("LM_BASE_URL") or current_provider.strip().lower() == "lmstudio" + ): + from hermes_cli.models import fetch_lmstudio_models + from hermes_cli.auth import AuthError + is_current_lmstudio = current_provider.strip().lower() == "lmstudio" + lm_base = ( + os.environ.get("LM_BASE_URL") + or (current_base_url if is_current_lmstudio and current_base_url else None) + or "http://127.0.0.1:1234/v1" + ) + try: + live = fetch_lmstudio_models( + api_key=os.environ.get("LM_API_KEY", ""), + base_url=lm_base, + timeout=1.5, # Smaller timeout for picker + ) + except AuthError: + live = [] + if not live and is_current_lmstudio and current_model: + live = [current_model] + curated["lmstudio"] = live # --- 1. Check Hermes-mapped providers --- for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items(): diff --git a/hermes_cli/models.py b/hermes_cli/models.py index 320c8f97f4..bd590a5576 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -768,6 +768,7 @@ class ProviderEntry(NamedTuple): CANONICAL_PROVIDERS: list[ProviderEntry] = [ ProviderEntry("nous", "Nous Portal", "Nous Portal (Nous Research subscription)"), ProviderEntry("openrouter", "OpenRouter", "OpenRouter (100+ models, pay-per-use)"), + ProviderEntry("lmstudio", "LM Studio", "LM Studio (local desktop app with built-in model server)"), ProviderEntry("ai-gateway", "Vercel AI Gateway", "Vercel AI Gateway (200+ models, $5 free credit, no markup)"), ProviderEntry("anthropic", "Anthropic", "Anthropic (Claude models — API key or Claude Code)"), ProviderEntry("openai-codex", "OpenAI Codex", "OpenAI Codex"), @@ -870,6 +871,9 @@ _PROVIDER_ALIASES = { "nvidia-nim": "nvidia", "build-nvidia": "nvidia", "nemotron": "nvidia", + "lmstudio": "lmstudio", + "lm-studio": "lmstudio", + "lm_studio": "lmstudio", "ollama": "custom", # bare "ollama" = local; use "ollama-cloud" for cloud "ollama_cloud": "ollama-cloud", } @@ -2195,6 +2199,225 @@ def _is_github_models_base_url(base_url: Optional[str]) -> bool: ) +def probe_lmstudio_models( + api_key: Optional[str] = None, + base_url: Optional[str] = None, + timeout: float = 5.0, +) -> Optional[list[str]]: + """Probe LM Studio's model listing. + + Returns chat-capable model keys on success, including the valid empty-list + case when the server is reachable but has no non-embedding models. + Returns ``None`` on network errors, malformed responses, or empty/invalid + base URLs. + + Raises ``AuthError`` on HTTP 401/403 so callers can surface token issues + separately from reachability problems. 
+ """ + server_root = (base_url or "").strip().rstrip("/") + if server_root.endswith("/v1"): + server_root = server_root[:-3].rstrip("/") + if not server_root: + return None + + headers = {"User-Agent": _HERMES_USER_AGENT} + token = str(api_key or "").strip() + if token: + headers["Authorization"] = f"Bearer {token}" + request = urllib.request.Request(server_root + "/api/v1/models", headers=headers) + try: + with urllib.request.urlopen(request, timeout=timeout) as resp: + payload = json.loads(resp.read().decode()) + except urllib.error.HTTPError as exc: + if exc.code in (401, 403): + from hermes_cli.auth import AuthError + raise AuthError( + f"LM Studio rejected the request with HTTP {exc.code}.", + provider="lmstudio", + code="auth_rejected", + ) from exc + import logging + logging.getLogger(__name__).debug( + "LM Studio probe at %s failed with HTTP %s", server_root, exc.code, + ) + return None + except Exception as exc: + import logging + logging.getLogger(__name__).debug( + "LM Studio probe at %s failed: %s", server_root, exc, + ) + return None + + raw_models = payload.get("models") if isinstance(payload, dict) else None + if not isinstance(raw_models, list): + import logging + logging.getLogger(__name__).debug( + "LM Studio probe at %s returned malformed payload (no `models` list)", + server_root, + ) + return None + + keys: list[str] = [] + for raw in raw_models: + if not isinstance(raw, dict): + continue + if str(raw.get("type") or "").strip().lower() == "embedding": + continue + key = str(raw.get("key") or raw.get("id") or "").strip() + if key and key not in keys: + keys.append(key) + return keys + + +def fetch_lmstudio_models( + api_key: Optional[str] = None, + base_url: Optional[str] = None, + timeout: float = 5.0, +) -> list[str]: + """Fetch LM Studio chat-capable model keys from native ``/api/v1/models``. + + Returns a list of model keys (e.g. ``publisher/model-name``) with embedding + models filtered out. Returns an empty list on network errors, malformed + responses, or empty/invalid base URLs. + + Raises ``AuthError`` on HTTP 401/403 so callers can distinguish a missing + or wrong ``LM_API_KEY`` from an unreachable server — the most common + LM Studio support case once auth-enabled mode is turned on. + """ + models = probe_lmstudio_models(api_key=api_key, base_url=base_url, timeout=timeout) + return models or [] + + +def ensure_lmstudio_model_loaded( + model: str, + base_url: Optional[str], + api_key: Optional[str], + target_context_length: int, + timeout: float = 120.0, +) -> Optional[int]: + """Ensure LM Studio has ``model`` loaded with at least ``target_context_length``. + + No-op when an instance is already loaded with sufficient context. Otherwise + POSTs ``/api/v1/models/load`` to (re)load with the target context, capped + at the model's ``max_context_length``. Returns the resolved loaded context + length, or ``None`` when the probe / load failed. 
+ """ + server_root = (base_url or "").strip().rstrip("/") + if server_root.endswith("/v1"): + server_root = server_root[:-3].rstrip("/") + if not server_root: + return None + + headers = {"User-Agent": _HERMES_USER_AGENT} + token = str(api_key or "").strip() + if token: + headers["Authorization"] = f"Bearer {token}" + + try: + with urllib.request.urlopen( + urllib.request.Request(server_root + "/api/v1/models", headers=headers), + timeout=10, + ) as resp: + payload = json.loads(resp.read().decode()) + except Exception: + return None + + raw_models = payload.get("models") if isinstance(payload, dict) else None + if not isinstance(raw_models, list): + return None + + target_entry = None + for raw in raw_models: + if not isinstance(raw, dict): + continue + if raw.get("key") == model or raw.get("id") == model: + target_entry = raw + break + if target_entry is None: + return None + + max_ctx = target_entry.get("max_context_length") + if isinstance(max_ctx, int) and max_ctx > 0: + target_context_length = min(target_context_length, max_ctx) + + for inst in target_entry.get("loaded_instances") or []: + cfg = inst.get("config") if isinstance(inst, dict) else None + loaded_ctx = cfg.get("context_length") if isinstance(cfg, dict) else None + if isinstance(loaded_ctx, int) and loaded_ctx >= target_context_length: + return loaded_ctx + + body = json.dumps({ + "model": model, + "context_length": target_context_length, + }).encode() + load_headers = dict(headers) + load_headers["Content-Type"] = "application/json" + try: + with urllib.request.urlopen( + urllib.request.Request( + server_root + "/api/v1/models/load", + data=body, + headers=load_headers, + method="POST", + ), + timeout=timeout, + ) as resp: + resp.read() + except Exception: + return None + return target_context_length + + +def lmstudio_model_reasoning_options( + model: str, + base_url: Optional[str], + api_key: Optional[str] = None, + timeout: float = 5.0, +) -> list[str]: + """Return the reasoning ``allowed_options`` LM Studio publishes for ``model``. + + Pulls ``capabilities.reasoning.allowed_options`` from ``/api/v1/models``. + Returns ``[]`` when the model is unknown, the endpoint is unreachable, + or the model does not declare a reasoning capability. 
+ """ + server_root = (base_url or "").strip().rstrip("/") + if server_root.endswith("/v1"): + server_root = server_root[:-3].rstrip("/") + if not server_root: + return [] + + headers = {"User-Agent": _HERMES_USER_AGENT} + token = str(api_key or "").strip() + if token: + headers["Authorization"] = f"Bearer {token}" + + try: + with urllib.request.urlopen( + urllib.request.Request(server_root + "/api/v1/models", headers=headers), + timeout=timeout, + ) as resp: + payload = json.loads(resp.read().decode()) + except Exception: + return [] + + raw_models = payload.get("models") if isinstance(payload, dict) else None + if not isinstance(raw_models, list): + return [] + + for raw in raw_models: + if not isinstance(raw, dict): + continue + if raw.get("key") != model and raw.get("id") != model: + continue + caps = raw.get("capabilities") + reasoning = caps.get("reasoning") if isinstance(caps, dict) else None + opts = reasoning.get("allowed_options") if isinstance(reasoning, dict) else None + if isinstance(opts, list): + return [str(o).strip().lower() for o in opts if isinstance(o, str)] + return [] + return [] + + def _fetch_github_models(api_key: Optional[str] = None, timeout: float = 5.0) -> Optional[list[str]]: catalog = fetch_github_model_catalog(api_key=api_key, timeout=timeout) if not catalog: @@ -2790,6 +3013,40 @@ def validate_requested_model( "message": "Model names cannot contain spaces.", } + if normalized == "lmstudio": + from hermes_cli.auth import AuthError + # Use probe_lmstudio_models so we can distinguish None (unreachable + # / malformed response) from [] (reachable, but no chat-capable models + # are loaded). fetch_lmstudio_models collapses both to []. + try: + models = probe_lmstudio_models(api_key=api_key, base_url=base_url) + except AuthError as exc: + return { + "accepted": False, "persist": False, "recognized": False, + "message": ( + f"{exc} Set `LM_API_KEY` (or update it) to match the server's bearer token." + ), + } + if models is None: + return { + "accepted": False, "persist": False, "recognized": False, + "message": f"Could not reach LM Studio's `/api/v1/models` to validate `{requested}`.", + } + if not models: + return { + "accepted": False, "persist": False, "recognized": False, + "message": ( + f"LM Studio is reachable but no chat-capable models are loaded. " + f"Load `{requested}` in LM Studio (Developer tab → Load Model) and try again." + ), + } + if requested_for_lookup in set(models): + return {"accepted": True, "persist": True, "recognized": True, "message": None} + return { + "accepted": False, "persist": False, "recognized": False, + "message": f"Model `{requested}` was not found in LM Studio's model listing.", + } + if normalized == "custom": # Try probing with correct auth for the api_mode. 
if api_mode == "anthropic_messages": diff --git a/hermes_cli/providers.py b/hermes_cli/providers.py index 5620250e0f..60f8dd8eaa 100644 --- a/hermes_cli/providers.py +++ b/hermes_cli/providers.py @@ -71,6 +71,13 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = { auth_type="oauth_external", base_url_override="cloudcode-pa://google", ), + "lmstudio": HermesOverlay( + transport="openai_chat", + auth_type="api_key", + extra_env_vars=("LM_API_KEY",), + base_url_override="http://127.0.0.1:1234/v1", + base_url_env_var="LM_BASE_URL", + ), "copilot-acp": HermesOverlay( transport="codex_responses", auth_type="external_process", @@ -345,6 +352,7 @@ _LABEL_OVERRIDES: Dict[str, str] = { "xiaomi": "Xiaomi MiMo", "gmi": "GMI Cloud", "tencent-tokenhub": "Tencent TokenHub", + "lmstudio": "LM Studio", "local": "Local endpoint", "bedrock": "AWS Bedrock", "ollama-cloud": "Ollama Cloud", diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py index e2883c883f..a64be10b11 100644 --- a/hermes_cli/runtime_provider.py +++ b/hermes_cli/runtime_provider.py @@ -1245,14 +1245,20 @@ def resolve_runtime_provider( if pconfig and pconfig.auth_type == "api_key": creds = resolve_api_key_provider_credentials(provider) # Honour model.base_url from config.yaml when the configured provider - # matches this provider — mirrors the Anthropic path above. Without - # this, users who set model.base_url to e.g. api.minimaxi.com/anthropic - # (China endpoint) still get the hardcoded api.minimax.io default (#6039). + # matches this provider, unless the provider-specific BASE_URL env var + # is set. That keeps temporary env overrides (e.g. LM_BASE_URL) in sync + # with picker-time probing while still preserving saved config URLs when + # no override is present. cfg_provider = str(model_cfg.get("provider") or "").strip().lower() cfg_base_url = "" if cfg_provider == provider: cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/") - base_url = cfg_base_url or creds.get("base_url", "").rstrip("/") + env_base_url = "" + if pconfig.base_url_env_var: + env_base_url = os.getenv(pconfig.base_url_env_var, "").strip().rstrip("/") + base_url = creds.get("base_url", "").rstrip("/") + if cfg_base_url and not env_base_url: + base_url = cfg_base_url api_mode = "chat_completions" if provider == "copilot": api_mode = _copilot_runtime_api_mode(model_cfg, creds.get("api_key", "")) diff --git a/hermes_cli/status.py b/hermes_cli/status.py index b4a6101885..f02f5f2672 100644 --- a/hermes_cli/status.py +++ b/hermes_cli/status.py @@ -274,6 +274,23 @@ def show_status(args): label = "configured" if configured else "not configured (run: hermes model)" print(f" {pname:<16} {check_mark(configured)} {label}") + # LM Studio reachability — only probe when it's the active provider so + # users with foreign configs don't see noise. Auth rejection vs. silent + # empty list is the most common LM Studio support case. 
+ if _effective_provider_label() == "LM Studio": + from hermes_cli.models import probe_lmstudio_models + model_cfg = config.get("model") + base = (model_cfg.get("base_url") if isinstance(model_cfg, dict) else None) or get_env_value("LM_BASE_URL") or "http://127.0.0.1:1234/v1" + try: + models = probe_lmstudio_models(api_key=get_env_value("LM_API_KEY") or "", base_url=base, timeout=1.5) + if models is None: + ok, msg = False, f"unreachable at {base}" + else: + ok, msg = True, f"reachable ({len(models)} model(s)) at {base}" + except AuthError: + ok, msg = False, "auth rejected — set LM_API_KEY" + print(f" {'LM Studio':<16} {check_mark(ok)} {msg}") + # ========================================================================= # Terminal Configuration # ========================================================================= diff --git a/run_agent.py b/run_agent.py index 802a1a4099..65be5add9a 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1826,9 +1826,6 @@ class AIAgent: ) _config_context_length = None - # Store for reuse in switch_model (so config override persists across model switches) - self._config_context_length = _config_context_length - # Resolve custom_providers list once for reuse below (startup # context-length override and plugin context-engine init). try: @@ -1887,7 +1884,14 @@ class AIAgent: file=sys.stderr, ) break - + + # Persist for reuse on switch_model / fallback activation. Must come + # AFTER the custom_providers branch so per-model overrides aren't lost. + self._config_context_length = _config_context_length + + self._ensure_lmstudio_runtime_loaded(_config_context_length) + + # Select context engine: config-driven (like memory providers). # 1. Check config.yaml context.engine setting # 2. Check plugins/context_engine// directory (repo-shipped) @@ -2129,6 +2133,24 @@ class AIAgent: if hasattr(self, "context_compressor") and self.context_compressor: self.context_compressor.on_session_reset() + def _ensure_lmstudio_runtime_loaded(self, config_context_length: Optional[int] = None) -> None: + """ + Preload the LM Studio model with at least Hermes' minimum context. + """ + if (self.provider or "").strip().lower() != "lmstudio": + return + try: + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + from hermes_cli.models import ensure_lmstudio_model_loaded + if config_context_length is None: + config_context_length = getattr(self, "_config_context_length", None) + target_ctx = max(config_context_length or 0, MINIMUM_CONTEXT_LENGTH) + ensure_lmstudio_model_loaded( + self.model, self.base_url, getattr(self, "api_key", ""), target_ctx, + ) + except Exception as err: + logger.debug("LM Studio preload skipped: %s", err) + def switch_model(self, new_model, new_provider, api_key='', base_url='', api_mode=''): """Switch the model/provider in-place for a live agent. @@ -2224,6 +2246,9 @@ class AIAgent: ) ) + # ── LM Studio: preload before probing context length ── + self._ensure_lmstudio_runtime_loaded() + # ── Update context compressor ── if hasattr(self, "context_compressor") and self.context_compressor: from agent.model_metadata import get_model_context_length @@ -7327,6 +7352,9 @@ class AIAgent: ) ) + # LM Studio: preload before probing the fallback's context length. + self._ensure_lmstudio_runtime_loaded() + # Update context compressor limits for the fallback model. # Without this, compression decisions use the primary model's # context window (e.g. 200K) instead of the fallback's (e.g. 
32K), @@ -8047,6 +8075,7 @@ class AIAgent: or base_url_host_matches(self.base_url, "moonshot.cn") ) _is_tokenhub = base_url_host_matches(self._base_url_lower, "tokenhub.tencentmaas.com") + _is_lmstudio = (self.provider or "").strip().lower() == "lmstudio" # Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE # sentinel (temperature omitted entirely), a numeric override, or None. @@ -8119,6 +8148,7 @@ class AIAgent: is_nvidia_nim=_is_nvidia, is_kimi=_is_kimi, is_tokenhub=_is_tokenhub, + is_lmstudio=_is_lmstudio, is_custom_provider=self.provider == "custom", ollama_num_ctx=self._ollama_num_ctx, provider_preferences=_prefs or None, @@ -8129,6 +8159,7 @@ class AIAgent: omit_temperature=_omit_temp, supports_reasoning=self._supports_reasoning_extra_body(), github_reasoning_extra=self._github_models_reasoning_extra_body() if _is_gh else None, + lmstudio_reasoning_options=self._lmstudio_reasoning_options_cached() if _is_lmstudio else None, anthropic_max_output=_ant_max, provider_name=self.provider, ) @@ -8154,6 +8185,10 @@ class AIAgent: return bool(github_model_reasoning_efforts(self.model)) except Exception: return False + if (self.provider or "").strip().lower() == "lmstudio": + opts = self._lmstudio_reasoning_options_cached() + # "off-only" (or absent) means no real reasoning capability. + return any(opt and opt != "off" for opt in opts) if "openrouter" not in self._base_url_lower: return False if "api.mistral.ai" in self._base_url_lower: @@ -8171,6 +8206,48 @@ class AIAgent: ) return any(model.startswith(prefix) for prefix in reasoning_model_prefixes) + def _lmstudio_reasoning_options_cached(self) -> list[str]: + """Probe LM Studio's published reasoning ``allowed_options`` once per + (model, base_url). The list (e.g. ``["off","on"]`` or + ``["off","minimal","low"]``) is needed both for the supports-reasoning + gate and for clamping the emitted ``reasoning_effort`` so toggle-style + models don't 400 on ``high``. Cache is keyed on (model, base_url) so + ``/model`` swaps and base-URL changes don't reuse a stale list, and an + empty result (transient probe failure) is *not* cached so the next call + retries instead of silently disabling reasoning for the rest of the + session. + """ + cache = getattr(self, "_lm_reasoning_opts_cache", None) + if cache is None: + cache = self._lm_reasoning_opts_cache = {} + key = (self.model, self.base_url) + cached = cache.get(key) + if cached: + return cached + try: + from hermes_cli.models import lmstudio_model_reasoning_options + opts = lmstudio_model_reasoning_options( + self.model, self.base_url, getattr(self, "api_key", ""), + ) + except Exception: + opts = [] + if opts: + cache[key] = opts + return opts + + def _resolve_lmstudio_summary_reasoning_effort(self) -> Optional[str]: + """Resolve a safe top-level ``reasoning_effort`` for LM Studio. + + The iteration-limit summary path calls ``chat.completions.create()`` + directly, bypassing the transport. Share the helper so the two paths + can't drift on effort resolution and clamping. 
+ """ + from agent.lmstudio_reasoning import resolve_lmstudio_effort + return resolve_lmstudio_effort( + self.reasoning_config, + self._lmstudio_reasoning_options_cached(), + ) + def _github_models_reasoning_extra_body(self) -> dict | None: """Format reasoning payload for GitHub Models/OpenAI-compatible routes.""" try: @@ -9692,7 +9769,19 @@ class AIAgent: _omit_summary_temperature = _raw_summary_temp is _OMIT_TEMP _summary_temperature = None if _omit_summary_temperature else _raw_summary_temp _is_nous = "nousresearch" in self._base_url_lower - if self._supports_reasoning_extra_body(): + # LM Studio uses top-level `reasoning_effort` (not extra_body.reasoning). + # Mirror ChatCompletionsTransport.build_kwargs() so the summary path + # — which calls chat.completions.create() directly without going + # through the transport — sends the same shape the transport does. + _is_lmstudio_summary = ( + (self.provider or "").strip().lower() == "lmstudio" + and self._supports_reasoning_extra_body() + ) + _lm_reasoning_effort: str | None = ( + self._resolve_lmstudio_summary_reasoning_effort() + if _is_lmstudio_summary else None + ) + if not _is_lmstudio_summary and self._supports_reasoning_extra_body(): if self.reasoning_config is not None: summary_extra_body["reasoning"] = self.reasoning_config else: @@ -9719,6 +9808,8 @@ class AIAgent: summary_kwargs["temperature"] = _summary_temperature if self.max_tokens is not None: summary_kwargs.update(self._max_tokens_param(self.max_tokens)) + if _lm_reasoning_effort is not None: + summary_kwargs["reasoning_effort"] = _lm_reasoning_effort # Include provider routing preferences provider_preferences = {} @@ -9784,6 +9875,8 @@ class AIAgent: summary_kwargs["temperature"] = _summary_temperature if self.max_tokens is not None: summary_kwargs.update(self._max_tokens_param(self.max_tokens)) + if _lm_reasoning_effort is not None: + summary_kwargs["reasoning_effort"] = _lm_reasoning_effort if summary_extra_body: summary_kwargs["extra_body"] = summary_extra_body diff --git a/tests/agent/transports/test_chat_completions.py b/tests/agent/transports/test_chat_completions.py index 69326887af..e558fa3de7 100644 --- a/tests/agent/transports/test_chat_completions.py +++ b/tests/agent/transports/test_chat_completions.py @@ -376,6 +376,80 @@ class TestChatCompletionsKimi: assert "type" not in kw["tools"][0]["function"]["parameters"]["properties"]["q"] +class TestChatCompletionsLmStudioReasoning: + """LM Studio publishes per-model reasoning ``allowed_options``. When the + user requests an effort the model can't honor (e.g. ``high`` on a + toggle-style ``["off","on"]`` model), the transport omits + ``reasoning_effort`` so LM Studio falls back to the model's default — + silently downgrading "high" to "low" would mislead the user. 
+ """ + + def test_omits_effort_when_high_not_allowed_toggle(self, transport): + kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"effort": "high"}, + lmstudio_reasoning_options=["off", "on"], + ) + assert "reasoning_effort" not in kw + + def test_omits_effort_when_high_not_allowed_minimal_low(self, transport): + kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"effort": "high"}, + lmstudio_reasoning_options=["off", "minimal", "low"], + ) + assert "reasoning_effort" not in kw + + def test_passes_through_when_effort_allowed(self, transport): + kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"effort": "high"}, + lmstudio_reasoning_options=["off", "low", "medium", "high"], + ) + assert kw["reasoning_effort"] == "high" + + def test_passes_through_aliased_on_for_toggle(self, transport): + # User has reasoning enabled at the default "medium"; toggle model + # publishes ["off","on"] which aliases to {"none","medium"}, so the + # default request is honorable and gets sent. + kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"effort": "medium"}, + lmstudio_reasoning_options=["off", "on"], + ) + assert kw["reasoning_effort"] == "medium" + + def test_disabled_keeps_none_when_off_allowed(self, transport): + kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"enabled": False}, + lmstudio_reasoning_options=["off", "on"], + ) + assert kw["reasoning_effort"] == "none" + + def test_no_options_falls_back_to_legacy_behavior(self, transport): + # When the probe failed or returned nothing, allowed_options is unknown; + # send whatever the user picked rather than blocking the request. 
+ kw = transport.build_kwargs( + model="gpt-oss", messages=[{"role": "user", "content": "Hi"}], + is_lmstudio=True, + supports_reasoning=True, + reasoning_config={"effort": "high"}, + lmstudio_reasoning_options=None, + ) + assert kw["reasoning_effort"] == "high" + + class TestChatCompletionsValidate: def test_none(self, transport): diff --git a/tests/hermes_cli/test_api_key_providers.py b/tests/hermes_cli/test_api_key_providers.py index 77afc61705..530075238f 100644 --- a/tests/hermes_cli/test_api_key_providers.py +++ b/tests/hermes_cli/test_api_key_providers.py @@ -145,6 +145,7 @@ class TestProviderRegistry: PROVIDER_ENV_VARS = ( "OPENROUTER_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN", + "LM_API_KEY", "LM_BASE_URL", "GLM_API_KEY", "ZAI_API_KEY", "Z_AI_API_KEY", "KIMI_API_KEY", "KIMI_BASE_URL", "STEPFUN_API_KEY", "STEPFUN_BASE_URL", "MINIMAX_API_KEY", "MINIMAX_CN_API_KEY", @@ -428,6 +429,29 @@ class TestResolveApiKeyProviderCredentials: assert creds["base_url"] == "https://api.githubcopilot.com" assert creds["source"] == "gh auth token" + def test_resolve_lmstudio_uses_token_and_base_url_from_env(self, monkeypatch): + monkeypatch.setenv("LM_API_KEY", "lm-token") + monkeypatch.setenv("LM_BASE_URL", "http://lmstudio.remote:4321/v1") + + creds = resolve_api_key_provider_credentials("lmstudio") + + assert creds["provider"] == "lmstudio" + assert creds["api_key"] == "lm-token" + assert creds["base_url"] == "http://lmstudio.remote:4321/v1" + + def test_resolve_lmstudio_no_api_key_substitutes_placeholder(self, monkeypatch): + # No-auth LM Studio: when LM_API_KEY isn't set, runtime credentials + # carry a placeholder so gateway/TUI/cron paths see the local server + # as configured. get_api_key_provider_status still reports unconfigured. 
+ monkeypatch.delenv("LM_API_KEY", raising=False) + monkeypatch.delenv("LM_BASE_URL", raising=False) + + creds = resolve_api_key_provider_credentials("lmstudio") + + assert creds["provider"] == "lmstudio" + assert creds["api_key"] == "dummy-lm-api-key" + assert creds["base_url"] == "http://127.0.0.1:1234/v1" + def test_try_gh_cli_token_uses_homebrew_path_when_not_on_path(self, monkeypatch): monkeypatch.setattr("hermes_cli.copilot_auth.shutil.which", lambda command: None) monkeypatch.setattr( diff --git a/tests/hermes_cli/test_model_provider_persistence.py b/tests/hermes_cli/test_model_provider_persistence.py index 0674836809..2a827ca7ef 100644 --- a/tests/hermes_cli/test_model_provider_persistence.py +++ b/tests/hermes_cli/test_model_provider_persistence.py @@ -260,6 +260,33 @@ class TestProviderPersistsAfterModelSave: assert model.get("default") == "minimax-m2.5" assert model.get("api_mode") == "anthropic_messages" + def test_lmstudio_provider_saved_when_selected(self, config_home, monkeypatch): + from hermes_cli.config import load_config + from hermes_cli.main import _model_flow_api_key_provider + + monkeypatch.setenv("LM_API_KEY", "lm-token") + monkeypatch.setattr( + "hermes_cli.auth._prompt_model_selection", + lambda models, current_model="": "publisher/model-a", + ) + monkeypatch.setattr("hermes_cli.auth.deactivate_provider", lambda: None) + monkeypatch.setattr( + "hermes_cli.models.fetch_lmstudio_models", + lambda api_key=None, base_url=None, timeout=5.0: ["publisher/model-a"], + ) + + with patch("builtins.input", side_effect=[""]): + _model_flow_api_key_provider(load_config(), "lmstudio", "old-model") + + import yaml + + config = yaml.safe_load((config_home / "config.yaml").read_text()) or {} + model = config.get("model") + assert isinstance(model, dict) + assert model.get("provider") == "lmstudio" + assert model.get("base_url") == "http://127.0.0.1:1234/v1" + assert model.get("default") == "publisher/model-a" + class TestBaseUrlValidation: """Reject non-URL values in the base URL prompt (e.g. shell commands).""" diff --git a/tests/hermes_cli/test_model_switch_custom_providers.py b/tests/hermes_cli/test_model_switch_custom_providers.py index 2899172ede..287fce926d 100644 --- a/tests/hermes_cli/test_model_switch_custom_providers.py +++ b/tests/hermes_cli/test_model_switch_custom_providers.py @@ -398,3 +398,84 @@ def test_list_authenticated_providers_total_models_reflects_grouped_count(monkey assert group["total_models"] == 6 # All six models are preserved in the grouped row. assert sorted(group["models"]) == sorted(f"model-{i}" for i in range(6)) + + +def test_lmstudio_picker_probes_active_config_base_url(monkeypatch): + """When `provider: lmstudio` is saved with a remote base_url and no + LM_BASE_URL env var, the picker must probe the saved base_url — not + 127.0.0.1. Regression: prior behavior always probed localhost, so users + with LM Studio on a lab box saw the wrong (or empty) model list. 
+ """ + monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {}) + monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {}) + monkeypatch.delenv("LM_BASE_URL", raising=False) + monkeypatch.delenv("LM_API_KEY", raising=False) + + captured: dict = {} + + def _fake_fetch(api_key=None, base_url=None, timeout=5.0): + captured["base_url"] = base_url + captured["api_key"] = api_key + return ["qwen/qwen3-coder-30b"] + + monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _fake_fetch) + + list_authenticated_providers( + current_provider="lmstudio", + current_base_url="http://192.168.1.10:1234/v1", + current_model="qwen/qwen3-coder-30b", + ) + + assert captured["base_url"] == "http://192.168.1.10:1234/v1" + + +def test_lmstudio_picker_lm_base_url_env_wins_over_active_config(monkeypatch): + """LM_BASE_URL env var must still take precedence over the saved + base_url so users can temporarily redirect the picker without editing + config.yaml. + """ + monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {}) + monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {}) + monkeypatch.setenv("LM_BASE_URL", "http://override.local:9999/v1") + monkeypatch.delenv("LM_API_KEY", raising=False) + + captured: dict = {} + + def _fake_fetch(api_key=None, base_url=None, timeout=5.0): + captured["base_url"] = base_url + return [] + + monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _fake_fetch) + + list_authenticated_providers( + current_provider="lmstudio", + current_base_url="http://192.168.1.10:1234/v1", + ) + + assert captured["base_url"] == "http://override.local:9999/v1" + + +def test_lmstudio_picker_skips_probe_when_not_configured(monkeypatch): + """If the user has never configured LM Studio (no LM_API_KEY / LM_BASE_URL + and not on lmstudio), the picker must not pay the localhost probe cost + just to discover LM Studio is unavailable. 
+ """ + monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {}) + monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {}) + monkeypatch.delenv("LM_BASE_URL", raising=False) + monkeypatch.delenv("LM_API_KEY", raising=False) + + captured: dict = {} + + def _fake_fetch(api_key=None, base_url=None, timeout=5.0): + captured["base_url"] = base_url + return [] + + monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _fake_fetch) + + list_authenticated_providers( + current_provider="openrouter", + current_base_url="https://openrouter.ai/api/v1", + ) + + assert "base_url" not in captured diff --git a/tests/hermes_cli/test_model_validation.py b/tests/hermes_cli/test_model_validation.py index c8e334d698..c81cae4601 100644 --- a/tests/hermes_cli/test_model_validation.py +++ b/tests/hermes_cli/test_model_validation.py @@ -1,6 +1,6 @@ """Tests for provider-aware `/model` validation in hermes_cli.models.""" -from unittest.mock import patch +from unittest.mock import MagicMock, patch from hermes_cli.models import ( azure_foundry_model_api_mode, @@ -8,6 +8,7 @@ from hermes_cli.models import ( fetch_github_model_catalog, curated_models_for_provider, fetch_api_models, + fetch_lmstudio_models, github_model_reasoning_efforts, normalize_copilot_model_id, normalize_opencode_model_id, @@ -638,6 +639,110 @@ class TestValidateApiFallback: assert "http://localhost:8000/v1/models" in result["message"] assert "http://localhost:8000/v1" in result["message"] + def test_fetch_lmstudio_models_filters_embedding_type(self): + mock_resp = MagicMock() + mock_resp.__enter__.return_value = mock_resp + mock_resp.__exit__.return_value = False + mock_resp.read.return_value = ( + b'{"models":[' + b'{"key":"publisher/chat-model","id":"publisher/chat-model","type":"llm"},' + b'{"key":"publisher/embed-model","id":"publisher/embed-model","type":"embedding"}' + b']}' + ) + + with patch("hermes_cli.models.urllib.request.urlopen", return_value=mock_resp): + models = fetch_lmstudio_models(base_url="http://localhost:1234/v1") + + assert models == ["publisher/chat-model"] + + def test_validate_lmstudio_rejects_embedding_models(self): + mock_resp = MagicMock() + mock_resp.__enter__.return_value = mock_resp + mock_resp.__exit__.return_value = False + mock_resp.read.return_value = ( + b'{"models":[' + b'{"key":"publisher/chat-model","id":"publisher/chat-model","type":"llm"},' + b'{"key":"publisher/embed-model","id":"publisher/embed-model","type":"embedding"}' + b']}' + ) + + with patch("hermes_cli.models.urllib.request.urlopen", return_value=mock_resp): + result = validate_requested_model( + "publisher/embed-model", + "lmstudio", + base_url="http://localhost:1234/v1", + ) + + assert result["accepted"] is False + assert result["recognized"] is False + assert "not found in LM Studio's model listing" in result["message"] + + def test_fetch_lmstudio_models_raises_auth_error_on_401(self): + import urllib.error + from hermes_cli.auth import AuthError + import pytest + + http_error = urllib.error.HTTPError( + url="http://localhost:1234/api/v1/models", + code=401, + msg="Unauthorized", + hdrs=None, + fp=None, + ) + + with patch("hermes_cli.models.urllib.request.urlopen", side_effect=http_error): + with pytest.raises(AuthError) as excinfo: + fetch_lmstudio_models(base_url="http://localhost:1234/v1") + + assert excinfo.value.provider == "lmstudio" + assert excinfo.value.code == "auth_rejected" + assert "401" in str(excinfo.value) + + def test_fetch_lmstudio_models_returns_empty_on_network_error(self): + with patch( + 
"hermes_cli.models.urllib.request.urlopen", + side_effect=ConnectionRefusedError(), + ): + models = fetch_lmstudio_models(base_url="http://localhost:1234/v1") + + assert models == [] + + def test_validate_lmstudio_distinguishes_auth_failure(self): + import urllib.error + + http_error = urllib.error.HTTPError( + url="http://localhost:1234/api/v1/models", + code=401, + msg="Unauthorized", + hdrs=None, + fp=None, + ) + + with patch("hermes_cli.models.urllib.request.urlopen", side_effect=http_error): + result = validate_requested_model( + "publisher/chat-model", + "lmstudio", + base_url="http://localhost:1234/v1", + ) + + assert result["accepted"] is False + assert "401" in result["message"] + assert "LM_API_KEY" in result["message"] + + def test_validate_lmstudio_distinguishes_unreachable(self): + with patch( + "hermes_cli.models.urllib.request.urlopen", + side_effect=ConnectionRefusedError(), + ): + result = validate_requested_model( + "publisher/chat-model", + "lmstudio", + base_url="http://localhost:1234/v1", + ) + + assert result["accepted"] is False + assert "Could not reach LM Studio" in result["message"] + # -- validate — Codex auto-correction ------------------------------------------ diff --git a/tests/hermes_cli/test_runtime_provider_resolution.py b/tests/hermes_cli/test_runtime_provider_resolution.py index 4cff16e5af..7c2a9c519d 100644 --- a/tests/hermes_cli/test_runtime_provider_resolution.py +++ b/tests/hermes_cli/test_runtime_provider_resolution.py @@ -240,6 +240,110 @@ def test_resolve_runtime_provider_ai_gateway(monkeypatch): assert resolved["requested_provider"] == "ai-gateway" +def test_resolve_runtime_provider_lmstudio_uses_token_when_present(monkeypatch): + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio") + monkeypatch.setattr( + rp, + "_get_model_config", + lambda: { + "provider": "lmstudio", + "base_url": "http://127.0.0.1:1234/v1", + "default": "publisher/model-a", + }, + ) + monkeypatch.setattr( + rp, + "load_pool", + lambda provider: type("Pool", (), {"has_credentials": lambda self: False})(), + ) + monkeypatch.setattr( + rp, + "resolve_api_key_provider_credentials", + lambda provider: { + "provider": "lmstudio", + "api_key": "lm-token", + "base_url": "http://127.0.0.1:1234/v1", + "source": "LM_API_KEY", + }, + ) + + resolved = rp.resolve_runtime_provider(requested="lmstudio") + + assert resolved["provider"] == "lmstudio" + assert resolved["api_key"] == "lm-token" + assert resolved["api_mode"] == "chat_completions" + assert resolved["base_url"] == "http://127.0.0.1:1234/v1" + + +def test_resolve_runtime_provider_lmstudio_honors_saved_base_url(monkeypatch): + """Pre-existing configs with `provider: lmstudio` + custom base_url must keep working. + + Before this PR, `lmstudio` aliased to `custom`, so a user with a remote + LM Studio (e.g. lab box) could write `provider: "lmstudio"` plus + `base_url: "http://192.168.1.10:1234/v1"` and the custom path honored it. + Now that `lmstudio` is first-class with `inference_base_url=127.0.0.1`, + the saved `base_url` from `model_cfg` must still win — otherwise this + PR is a silent breaking change for those users. 
+ """ + monkeypatch.delenv("LM_API_KEY", raising=False) + monkeypatch.delenv("LM_BASE_URL", raising=False) + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio") + monkeypatch.setattr( + rp, + "_get_model_config", + lambda: { + "provider": "lmstudio", + "base_url": "http://192.168.1.10:1234/v1", + "default": "qwen/qwen3-coder-30b", + }, + ) + monkeypatch.setattr( + rp, + "load_pool", + lambda provider: type("Pool", (), {"has_credentials": lambda self: False})(), + ) + # Don't mock resolve_api_key_provider_credentials — exercise the real + # function so we test the end-to-end precedence between model_cfg and + # the pconfig default. + + resolved = rp.resolve_runtime_provider(requested="lmstudio") + + assert resolved["provider"] == "lmstudio" + assert resolved["api_mode"] == "chat_completions" + # The saved base_url must NOT be shadowed by the 127.0.0.1 default. + assert resolved["base_url"] == "http://192.168.1.10:1234/v1" + # No-auth LM Studio: missing LM_API_KEY substitutes the placeholder. + assert resolved["api_key"] == "dummy-lm-api-key" + + +def test_resolve_runtime_provider_lmstudio_base_url_env_wins_over_saved_base_url(monkeypatch): + """LM_BASE_URL should override the saved lmstudio base_url for temporary redirects.""" + monkeypatch.delenv("LM_API_KEY", raising=False) + monkeypatch.setenv("LM_BASE_URL", "http://override.local:9999/v1") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio") + monkeypatch.setattr( + rp, + "_get_model_config", + lambda: { + "provider": "lmstudio", + "base_url": "http://192.168.1.10:1234/v1", + "default": "qwen/qwen3-coder-30b", + }, + ) + monkeypatch.setattr( + rp, + "load_pool", + lambda provider: type("Pool", (), {"has_credentials": lambda self: False})(), + ) + + resolved = rp.resolve_runtime_provider(requested="lmstudio") + + assert resolved["provider"] == "lmstudio" + assert resolved["api_mode"] == "chat_completions" + assert resolved["base_url"] == "http://override.local:9999/v1" + assert resolved["api_key"] == "dummy-lm-api-key" + + def test_resolve_runtime_provider_ai_gateway_explicit_override_skips_pool(monkeypatch): def _unexpected_pool(provider): raise AssertionError(f"load_pool should not be called for {provider}") @@ -1237,6 +1341,21 @@ def test_resolve_provider_openrouter_unchanged(): assert resolve_provider("openrouter") == "openrouter" +def test_resolve_provider_lmstudio_returns_lmstudio(monkeypatch): + """resolve_provider('lmstudio') must return 'lmstudio', not 'custom'. + + Regression for the alias-map bug where 'lmstudio' was rewritten to + 'custom' before the PROVIDER_REGISTRY lookup, bypassing the first-class + LM Studio provider entirely at runtime. 
+ """ + from hermes_cli.auth import resolve_provider + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENROUTER_API_KEY", raising=False) + assert resolve_provider("lmstudio") == "lmstudio" + assert resolve_provider("lm-studio") == "lmstudio" + assert resolve_provider("lm_studio") == "lmstudio" + + def test_custom_provider_runtime_preserves_provider_name(monkeypatch): """resolve_runtime_provider with provider='custom' must return provider='custom'.""" monkeypatch.delenv("OPENAI_API_KEY", raising=False) diff --git a/tests/hermes_cli/test_status_model_provider.py b/tests/hermes_cli/test_status_model_provider.py index d9f8601532..af6b90204c 100644 --- a/tests/hermes_cli/test_status_model_provider.py +++ b/tests/hermes_cli/test_status_model_provider.py @@ -122,3 +122,34 @@ def test_show_status_hides_nous_subscription_section_when_feature_flag_is_off(mo out = capsys.readouterr().out assert "Nous Tool Gateway" not in out + + +def test_show_status_reports_empty_lmstudio_listing_as_reachable(monkeypatch, capsys, tmp_path): + from hermes_cli import status as status_mod + + _patch_common_status_deps(monkeypatch, status_mod, tmp_path) + monkeypatch.setattr( + status_mod, + "load_config", + lambda: { + "model": { + "default": "qwen/qwen3-coder-30b", + "provider": "lmstudio", + "base_url": "http://127.0.0.1:1234/v1", + } + }, + raising=False, + ) + monkeypatch.setattr(status_mod, "resolve_requested_provider", lambda requested=None: "lmstudio", raising=False) + monkeypatch.setattr(status_mod, "resolve_provider", lambda requested=None, **kwargs: "lmstudio", raising=False) + monkeypatch.setattr(status_mod, "provider_label", lambda provider: "LM Studio", raising=False) + monkeypatch.setattr( + "hermes_cli.models.probe_lmstudio_models", + lambda api_key=None, base_url=None, timeout=5.0: [], + ) + + status_mod.show_status(SimpleNamespace(all=False, deep=False)) + + out = capsys.readouterr().out + assert "LM Studio" in out + assert "reachable (0 model(s)) at http://127.0.0.1:1234/v1" in out diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 710888822b..0343cb84bb 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -4168,6 +4168,7 @@ def _(rid, params: dict) -> dict: cfg = _load_cfg() current_provider = getattr(agent, "provider", "") or "" current_model = getattr(agent, "model", "") or _resolve_model() + current_base_url = getattr(agent, "base_url", "") or "" # list_authenticated_providers already populates each provider's # "models" with the curated list (same source as `hermes model` and # classic CLI's /model picker). Do NOT overwrite with live @@ -4176,6 +4177,8 @@ def _(rid, params: dict) -> dict: # TTS, embeddings, rerankers, image/video generators). providers = list_authenticated_providers( current_provider=current_provider, + current_base_url=current_base_url, + current_model=current_model, user_providers=( cfg.get("providers") if isinstance(cfg.get("providers"), dict) else {} ), diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index b4e4148cfa..ef46c685a5 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -38,6 +38,7 @@ You need at least one way to connect to an LLM. 
Use `hermes model` to switch pro | **Hugging Face** | `HF_TOKEN` in `~/.hermes/.env` (provider: `huggingface`, aliases: `hf`) | | **Google / Gemini** | `GOOGLE_API_KEY` (or `GEMINI_API_KEY`) in `~/.hermes/.env` (provider: `gemini`) | | **Google Gemini (OAuth)** | `hermes model` → "Google Gemini (OAuth)" (provider: `google-gemini-cli`, free tier supported, browser PKCE login) | +| **LM Studio** | `hermes model` → "LM Studio" (provider: `lmstudio`, optional `LM_API_KEY`) | | **Custom Endpoint** | `hermes model` → choose "Custom endpoint" (saved in `config.yaml`) | :::tip Model key alias @@ -725,10 +726,10 @@ Then configure Hermes: ```bash hermes model -# Select "Custom endpoint (self-hosted / VLLM / etc.)" -# Enter URL: http://localhost:1234/v1 -# Skip API key (LM Studio doesn't require one) -# Enter model name +# Select "LM Studio" +# Press Enter to use http://127.0.0.1:1234/v1 +# Pick one of the discovered models +# If LM Studio server auth is enabled, enter LM_API_KEY when prompted ``` :::caution Context length often defaults to 2048