Merge pull request #2215 from NousResearch/hermes/hermes-31d7db3b

fix: infer provider from base URL for models.dev context length lookup
This commit is contained in:
Teknium 2026-03-20 12:52:07 -07:00 committed by GitHub
commit 8e884fb3f1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 66 additions and 18 deletions

View file

@@ -151,22 +151,42 @@ def _is_custom_endpoint(base_url: str) -> bool:
return bool(normalized) and not _is_openrouter_base_url(normalized)
def _is_known_provider_base_url(base_url: str) -> bool:
_URL_TO_PROVIDER: Dict[str, str] = {
"api.openai.com": "openai",
"chatgpt.com": "openai",
"api.anthropic.com": "anthropic",
"api.z.ai": "zai",
"api.moonshot.ai": "kimi-coding",
"api.kimi.com": "kimi-coding",
"api.minimax": "minimax",
"dashscope.aliyuncs.com": "alibaba",
"dashscope-intl.aliyuncs.com": "alibaba",
"openrouter.ai": "openrouter",
"inference-api.nousresearch.com": "nous",
"api.deepseek.com": "deepseek",
}
def _infer_provider_from_url(base_url: str) -> Optional[str]:
"""Infer the models.dev provider name from a base URL.
This allows context length resolution via models.dev for custom endpoints
like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
explicitly set the provider name in config.
"""
normalized = _normalize_base_url(base_url)
if not normalized:
return False
return None
parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}")
host = parsed.netloc.lower() or parsed.path.lower()
known_hosts = (
"api.openai.com",
"chatgpt.com",
"api.anthropic.com",
"api.z.ai",
"api.moonshot.ai",
"api.kimi.com",
"api.minimax",
)
return any(known_host in host for known_host in known_hosts)
for url_part, provider in _URL_TO_PROVIDER.items():
if url_part in host:
return provider
return None
def _is_known_provider_base_url(base_url: str) -> bool:
return _infer_provider_from_url(base_url) is not None
def is_local_endpoint(base_url: str) -> bool:
@@ -808,13 +828,21 @@ def get_model_context_length(
# These are provider-specific and take priority over the generic OR cache,
# since the same model can have different context limits per provider
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
if provider == "nous":
# If provider is generic (openrouter/custom/empty), try to infer from URL.
effective_provider = provider
if not effective_provider or effective_provider in ("openrouter", "custom"):
if base_url:
inferred = _infer_provider_from_url(base_url)
if inferred:
effective_provider = inferred
if effective_provider == "nous":
ctx = _resolve_nous_context_length(model)
if ctx:
return ctx
if provider:
if effective_provider:
from agent.models_dev import lookup_models_dev_context
ctx = lookup_models_dev_context(provider, model)
ctx = lookup_models_dev_context(effective_provider, model)
if ctx:
return ctx

10
cli.py
View file

@@ -1504,7 +1504,7 @@ class HermesCLI:
_cprint(f"{_DIM}{'' * (w - 2)}{_RST}")
self._reasoning_box_opened = False
def _stream_delta(self, text: str) -> None:
def _stream_delta(self, text) -> None:
"""Line-buffered streaming callback for real-time token rendering.
Receives text deltas from the agent as tokens arrive. Buffers
@@ -1514,7 +1514,15 @@ class HermesCLI:
Reasoning/thinking blocks (<REASONING_SCRATCHPAD>, <think>, etc.)
are suppressed during streaming since they'd display raw XML tags.
The agent strips them from the final response anyway.
A ``None`` value signals an intermediate turn boundary (tools are
about to execute). Flushes any open boxes and resets state so
tool feed lines render cleanly between turns.
"""
if text is None:
self._flush_stream()
self._reset_stream_state()
return
if not text:
return

View file

@@ -4838,7 +4838,7 @@ class AIAgent:
spinner.stop(cute_msg)
elif self.quiet_mode:
self._vprint(f" {cute_msg}")
elif self.quiet_mode and not self._has_stream_consumers():
elif self.quiet_mode:
face = random.choice(KawaiiSpinner.KAWAII_WAITING)
emoji = _get_tool_emoji(function_name)
preview = _build_tool_preview(function_name, function_args) or function_name
@@ -6568,7 +6568,19 @@ class AIAgent:
self._vprint(f" ┊ 💬 {clean}")
messages.append(assistant_msg)
# Close any open streaming display (response box, reasoning
# box) before tool execution begins. Intermediate turns may
# have streamed early content that opened the response box;
# flushing here prevents it from wrapping tool feed lines.
# Only signal the display callback — TTS (_stream_callback)
# should NOT receive None (it uses None as end-of-stream).
if self.stream_delta_callback:
try:
self.stream_delta_callback(None)
except Exception:
pass
_msg_count_before_tools = len(messages)
self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)