diff --git a/agent/model_metadata.py b/agent/model_metadata.py index be63719e2..e3636b6fe 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -151,22 +151,42 @@ def _is_custom_endpoint(base_url: str) -> bool: return bool(normalized) and not _is_openrouter_base_url(normalized) -def _is_known_provider_base_url(base_url: str) -> bool: +_URL_TO_PROVIDER: Dict[str, str] = { + "api.openai.com": "openai", + "chatgpt.com": "openai", + "api.anthropic.com": "anthropic", + "api.z.ai": "zai", + "api.moonshot.ai": "kimi-coding", + "api.kimi.com": "kimi-coding", + "api.minimax": "minimax", + "dashscope.aliyuncs.com": "alibaba", + "dashscope-intl.aliyuncs.com": "alibaba", + "openrouter.ai": "openrouter", + "inference-api.nousresearch.com": "nous", + "api.deepseek.com": "deepseek", +} + + +def _infer_provider_from_url(base_url: str) -> Optional[str]: + """Infer the models.dev provider name from a base URL. + + This allows context length resolution via models.dev for custom endpoints + like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to + explicitly set the provider name in config. 
+ """ normalized = _normalize_base_url(base_url) if not normalized: - return False + return None parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}") host = parsed.netloc.lower() or parsed.path.lower() - known_hosts = ( - "api.openai.com", - "chatgpt.com", - "api.anthropic.com", - "api.z.ai", - "api.moonshot.ai", - "api.kimi.com", - "api.minimax", - ) - return any(known_host in host for known_host in known_hosts) + for url_part, provider in _URL_TO_PROVIDER.items(): + if url_part in host: + return provider + return None + + +def _is_known_provider_base_url(base_url: str) -> bool: + return _infer_provider_from_url(base_url) is not None def is_local_endpoint(base_url: str) -> bool: @@ -808,13 +828,21 @@ def get_model_context_length( # These are provider-specific and take priority over the generic OR cache, # since the same model can have different context limits per provider # (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot). - if provider == "nous": + # If provider is generic (openrouter/custom/empty), try to infer from URL. + effective_provider = provider + if not effective_provider or effective_provider in ("openrouter", "custom"): + if base_url: + inferred = _infer_provider_from_url(base_url) + if inferred: + effective_provider = inferred + + if effective_provider == "nous": ctx = _resolve_nous_context_length(model) if ctx: return ctx - if provider: + if effective_provider: from agent.models_dev import lookup_models_dev_context - ctx = lookup_models_dev_context(provider, model) + ctx = lookup_models_dev_context(effective_provider, model) if ctx: return ctx diff --git a/cli.py b/cli.py index 73f83c7d5..9531df7bb 100755 --- a/cli.py +++ b/cli.py @@ -1504,7 +1504,7 @@ class HermesCLI: _cprint(f"{_DIM}└{'─' * (w - 2)}┘{_RST}") self._reasoning_box_opened = False - def _stream_delta(self, text: str) -> None: + def _stream_delta(self, text) -> None: """Line-buffered streaming callback for real-time token rendering. 
Receives text deltas from the agent as tokens arrive. Buffers @@ -1514,7 +1514,15 @@ class HermesCLI: Reasoning/thinking blocks (<think>, <reasoning>, etc.) are suppressed during streaming since they'd display raw XML tags. The agent strips them from the final response anyway. + + A ``None`` value signals an intermediate turn boundary (tools are + about to execute). Flushes any open boxes and resets state so + tool feed lines render cleanly between turns. """ if text is None: self._flush_stream() self._reset_stream_state() return if not text: return diff --git a/run_agent.py b/run_agent.py index 1c3b25fe2..0e444b1ad 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4838,7 +4838,7 @@ class AIAgent: spinner.stop(cute_msg) elif self.quiet_mode: self._vprint(f" {cute_msg}") - elif self.quiet_mode and not self._has_stream_consumers(): + elif self.quiet_mode: face = random.choice(KawaiiSpinner.KAWAII_WAITING) emoji = _get_tool_emoji(function_name) preview = _build_tool_preview(function_name, function_args) or function_name @@ -6568,7 +6568,19 @@ class AIAgent: self._vprint(f" ┊ 💬 {clean}") messages.append(assistant_msg) - + + # Close any open streaming display (response box, reasoning + # box) before tool execution begins. Intermediate turns may + # have streamed early content that opened the response box; + # flushing here prevents it from wrapping tool feed lines. + # Only signal the display callback — TTS (_stream_callback) + # should NOT receive None (it uses None as end-of-stream). + if self.stream_delta_callback: + try: + self.stream_delta_callback(None) + except Exception: + pass + _msg_count_before_tools = len(messages) self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)