Merge pull request #2215 from NousResearch/hermes/hermes-31d7db3b

fix: infer provider from base URL for models.dev context length lookup
This commit is contained in:
Teknium 2026-03-20 12:52:07 -07:00 committed by GitHub
commit 8e884fb3f1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 66 additions and 18 deletions

View file

@@ -151,22 +151,42 @@ def _is_custom_endpoint(base_url: str) -> bool:
return bool(normalized) and not _is_openrouter_base_url(normalized)
def _is_known_provider_base_url(base_url: str) -> bool:
_URL_TO_PROVIDER: Dict[str, str] = {
"api.openai.com": "openai",
"chatgpt.com": "openai",
"api.anthropic.com": "anthropic",
"api.z.ai": "zai",
"api.moonshot.ai": "kimi-coding",
"api.kimi.com": "kimi-coding",
"api.minimax": "minimax",
"dashscope.aliyuncs.com": "alibaba",
"dashscope-intl.aliyuncs.com": "alibaba",
"openrouter.ai": "openrouter",
"inference-api.nousresearch.com": "nous",
"api.deepseek.com": "deepseek",
}
def _infer_provider_from_url(base_url: str) -> Optional[str]:
"""Infer the models.dev provider name from a base URL.
This allows context length resolution via models.dev for custom endpoints
like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
explicitly set the provider name in config.
"""
normalized = _normalize_base_url(base_url)
if not normalized:
return False
return None
parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}")
host = parsed.netloc.lower() or parsed.path.lower()
known_hosts = (
"api.openai.com",
"chatgpt.com",
"api.anthropic.com",
"api.z.ai",
"api.moonshot.ai",
"api.kimi.com",
"api.minimax",
)
return any(known_host in host for known_host in known_hosts)
for url_part, provider in _URL_TO_PROVIDER.items():
if url_part in host:
return provider
return None
def _is_known_provider_base_url(base_url: str) -> bool:
return _infer_provider_from_url(base_url) is not None
def is_local_endpoint(base_url: str) -> bool:
@@ -808,13 +828,21 @@ def get_model_context_length(
# These are provider-specific and take priority over the generic OR cache,
# since the same model can have different context limits per provider
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
if provider == "nous":
# If provider is generic (openrouter/custom/empty), try to infer from URL.
effective_provider = provider
if not effective_provider or effective_provider in ("openrouter", "custom"):
if base_url:
inferred = _infer_provider_from_url(base_url)
if inferred:
effective_provider = inferred
if effective_provider == "nous":
ctx = _resolve_nous_context_length(model)
if ctx:
return ctx
if provider:
if effective_provider:
from agent.models_dev import lookup_models_dev_context
ctx = lookup_models_dev_context(provider, model)
ctx = lookup_models_dev_context(effective_provider, model)
if ctx:
return ctx

10
cli.py
View file

@@ -1504,7 +1504,7 @@ class HermesCLI:
_cprint(f"{_DIM}{'' * (w - 2)}{_RST}")
self._reasoning_box_opened = False
def _stream_delta(self, text: str) -> None:
def _stream_delta(self, text) -> None:
"""Line-buffered streaming callback for real-time token rendering.
Receives text deltas from the agent as tokens arrive. Buffers
@@ -1514,7 +1514,15 @@ class HermesCLI:
Reasoning/thinking blocks (<REASONING_SCRATCHPAD>, <think>, etc.)
are suppressed during streaming since they'd display raw XML tags.
The agent strips them from the final response anyway.
A ``None`` value signals an intermediate turn boundary (tools are
about to execute). Flushes any open boxes and resets state so
tool feed lines render cleanly between turns.
"""
if text is None:
self._flush_stream()
self._reset_stream_state()
return
if not text:
return

View file

@@ -4838,7 +4838,7 @@ class AIAgent:
spinner.stop(cute_msg)
elif self.quiet_mode:
self._vprint(f" {cute_msg}")
elif self.quiet_mode and not self._has_stream_consumers():
elif self.quiet_mode:
face = random.choice(KawaiiSpinner.KAWAII_WAITING)
emoji = _get_tool_emoji(function_name)
preview = _build_tool_preview(function_name, function_args) or function_name
@@ -6568,7 +6568,19 @@ class AIAgent:
self._vprint(f" ┊ 💬 {clean}")
messages.append(assistant_msg)
# Close any open streaming display (response box, reasoning
# box) before tool execution begins. Intermediate turns may
# have streamed early content that opened the response box;
# flushing here prevents it from wrapping tool feed lines.
# Only signal the display callback — TTS (_stream_callback)
# should NOT receive None (it uses None as end-of-stream).
if self.stream_delta_callback:
try:
self.stream_delta_callback(None)
except Exception:
pass
_msg_count_before_tools = len(messages)
self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)