mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(/model): show provider-enforced context length, not raw models.dev (#15438)
/model gpt-5.5 on openai-codex showed 'Context: 1,050,000 tokens' because the display block used ModelInfo.context_window directly from models.dev. Codex OAuth actually enforces 272K for the same slug, and the agent's compressor already runs at 272K via get_model_context_length() — so the banner + real context budget said 272K while /model lied with 1M. Route the display context through a new resolve_display_context_length() helper that always prefers agent.model_metadata.get_model_context_length (which knows about Codex OAuth, Copilot, Nous caps) and only falls back to models.dev when that returns nothing. Fix applied to all 3 /model display sites: cli.py _handle_model_switch gateway/run.py picker on_model_selected callback gateway/run.py text-fallback confirmation Reported by @emilstridell (Telegram, April 2026).
This commit is contained in:
parent
13038dc747
commit
05d8f11085
4 changed files with 161 additions and 33 deletions
29
cli.py
29
cli.py
|
|
@ -5374,29 +5374,26 @@ class HermesCLI:
|
||||||
_cprint(f" ✓ Model switched: {result.new_model}")
|
_cprint(f" ✓ Model switched: {result.new_model}")
|
||||||
_cprint(f" Provider: {provider_label}")
|
_cprint(f" Provider: {provider_label}")
|
||||||
|
|
||||||
# Rich metadata from models.dev
|
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||||
|
# Copilot, and Nous-enforced caps win over the raw models.dev entry
|
||||||
|
# (e.g. gpt-5.5 is 1.05M on openai but 272K on Codex OAuth).
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or self.base_url or "",
|
||||||
|
api_key=result.api_key or self.api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
_cprint(f" Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
_cprint(f" Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
_cprint(f" Max output: {mi.max_output:,} tokens")
|
_cprint(f" Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
_cprint(f" Cost: {mi.format_cost()}")
|
_cprint(f" Cost: {mi.format_cost()}")
|
||||||
_cprint(f" Capabilities: {mi.format_capabilities()}")
|
_cprint(f" Capabilities: {mi.format_capabilities()}")
|
||||||
else:
|
|
||||||
# Fallback to old context length lookup
|
|
||||||
try:
|
|
||||||
from agent.model_metadata import get_model_context_length
|
|
||||||
ctx = get_model_context_length(
|
|
||||||
result.new_model,
|
|
||||||
base_url=result.base_url or self.base_url,
|
|
||||||
api_key=result.api_key or self.api_key,
|
|
||||||
provider=result.target_provider,
|
|
||||||
)
|
|
||||||
_cprint(f" Context: {ctx:,} tokens")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Cache notice
|
# Cache notice
|
||||||
cache_enabled = (
|
cache_enabled = (
|
||||||
|
|
|
||||||
|
|
@ -5659,9 +5659,17 @@ class GatewayRunner:
|
||||||
lines = [f"Model switched to `{result.new_model}`"]
|
lines = [f"Model switched to `{result.new_model}`"]
|
||||||
lines.append(f"Provider: {plabel}")
|
lines.append(f"Provider: {plabel}")
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or current_base_url or "",
|
||||||
|
api_key=result.api_key or current_api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
lines.append(f"Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
|
|
@ -5795,28 +5803,25 @@ class GatewayRunner:
|
||||||
lines = [f"Model switched to `{result.new_model}`"]
|
lines = [f"Model switched to `{result.new_model}`"]
|
||||||
lines.append(f"Provider: {provider_label}")
|
lines.append(f"Provider: {provider_label}")
|
||||||
|
|
||||||
# Rich metadata from models.dev
|
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||||
|
# Copilot, and Nous-enforced caps win over the raw models.dev entry.
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or current_base_url or "",
|
||||||
|
api_key=result.api_key or current_api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
lines.append(f"Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
lines.append(f"Cost: {mi.format_cost()}")
|
lines.append(f"Cost: {mi.format_cost()}")
|
||||||
lines.append(f"Capabilities: {mi.format_capabilities()}")
|
lines.append(f"Capabilities: {mi.format_capabilities()}")
|
||||||
else:
|
|
||||||
try:
|
|
||||||
from agent.model_metadata import get_model_context_length
|
|
||||||
ctx = get_model_context_length(
|
|
||||||
result.new_model,
|
|
||||||
base_url=result.base_url or current_base_url,
|
|
||||||
api_key=result.api_key or current_api_key,
|
|
||||||
provider=result.target_provider,
|
|
||||||
)
|
|
||||||
lines.append(f"Context: {ctx:,} tokens")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Cache notice
|
# Cache notice
|
||||||
cache_enabled = (
|
cache_enabled = (
|
||||||
|
|
|
||||||
|
|
@ -527,6 +527,42 @@ def _resolve_alias_fallback(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_display_context_length(
    model: str,
    provider: str,
    base_url: str = "",
    api_key: str = "",
    model_info: Optional[ModelInfo] = None,
) -> Optional[int]:
    """Resolve the context length to show in /model output.

    models.dev reports per-vendor context windows (e.g. gpt-5.5 = 1.05M on
    openai), but provider-enforced limits can be lower (e.g. Codex OAuth caps
    the same slug at 272K). ``agent.model_metadata.get_model_context_length``
    is the authoritative source — it already knows about Codex OAuth, Copilot,
    and Nous caps and falls back to models.dev for the rest — so its answer is
    preferred; ``model_info.context_window`` is consulted only when the
    resolver yields nothing.

    Args:
        model: Model slug to resolve.
        provider: Provider identifier; an empty string is passed to the
            resolver as ``None``.
        base_url: Endpoint base URL, forwarded so the resolver can detect
            provider-specific caps.
        api_key: API key, forwarded to the resolver.
        model_info: Raw models.dev entry, used only as a fallback.

    Returns:
        The context length in tokens, or ``None`` when neither source has
        a usable value.
    """
    # Provider-aware chain first. The import is deliberately lazy and any
    # failure (missing agent package, resolver error) is swallowed so this
    # display-only path degrades to the models.dev fallback instead of
    # crashing the /model command.
    try:
        from agent.model_metadata import get_model_context_length

        enforced = get_model_context_length(
            model,
            base_url=base_url or "",
            api_key=api_key or "",
            provider=provider or None,
        )
        if enforced:
            return int(enforced)
    except Exception:
        pass

    # Fallback: the raw models.dev context window, when present and non-zero.
    if model_info is not None and model_info.context_window:
        return int(model_info.context_window)
    return None
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Core model-switching pipeline
|
# Core model-switching pipeline
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
90
tests/hermes_cli/test_model_switch_context_display.py
Normal file
90
tests/hermes_cli/test_model_switch_context_display.py
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
"""Regression test for /model context-length display on provider-capped models.
|
||||||
|
|
||||||
|
Bug (April 2026): `/model gpt-5.5` on openai-codex (ChatGPT OAuth) showed
|
||||||
|
"Context: 1,050,000 tokens" because the display code used the raw models.dev
|
||||||
|
``ModelInfo.context_window`` (which reports the direct-OpenAI API value) instead
|
||||||
|
of the provider-aware resolver. The agent was actually running at 272K — Codex
|
||||||
|
OAuth's enforced cap — so the display was lying to the user.
|
||||||
|
|
||||||
|
Fix: ``resolve_display_context_length()`` prefers
|
||||||
|
``agent.model_metadata.get_model_context_length`` (which knows about Codex OAuth,
|
||||||
|
Copilot, Nous, etc.) and falls back to models.dev only if that returns nothing.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeModelInfo:
|
||||||
|
def __init__(self, ctx):
|
||||||
|
self.context_window = ctx
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveDisplayContextLength:
    """Behavioral contract for ``resolve_display_context_length``.

    Patching ``agent.model_metadata.get_model_context_length`` works even
    though production imports it function-locally: the import runs at call
    time, after the patch is already in place.
    """

    def test_codex_oauth_overrides_models_dev(self):
        """gpt-5.5 on openai-codex must show Codex's 272K cap, not models.dev's 1.05M."""
        models_dev_entry = _FakeModelInfo(1_050_000)  # what models.dev reports
        with patch(
            "agent.model_metadata.get_model_context_length",
            return_value=272_000,  # what Codex OAuth actually enforces
        ):
            ctx = resolve_display_context_length(
                "gpt-5.5",
                "openai-codex",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="",
                model_info=models_dev_entry,
            )
        assert ctx == 272_000, (
            "Codex OAuth's 272K cap must win over models.dev's 1.05M for gpt-5.5"
        )

    def test_falls_back_to_model_info_when_resolver_returns_none(self):
        models_dev_entry = _FakeModelInfo(1_048_576)
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=None
        ):
            ctx = resolve_display_context_length(
                "some-model",
                "some-provider",
                model_info=models_dev_entry,
            )
        assert ctx == 1_048_576

    def test_returns_none_when_both_sources_empty(self):
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=None
        ):
            ctx = resolve_display_context_length(
                "unknown-model",
                "unknown-provider",
                model_info=None,
            )
        assert ctx is None

    def test_resolver_exception_falls_back_to_model_info(self):
        models_dev_entry = _FakeModelInfo(200_000)
        with patch(
            "agent.model_metadata.get_model_context_length",
            side_effect=RuntimeError("network down"),
        ):
            ctx = resolve_display_context_length("x", "y", model_info=models_dev_entry)
        assert ctx == 200_000

    def test_prefers_resolver_even_when_model_info_has_larger_value(self):
        """Invariant: the provider-aware resolver is authoritative, even if
        models.dev reports a bigger window."""
        models_dev_entry = _FakeModelInfo(2_000_000)
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=128_000
        ):
            ctx = resolve_display_context_length(
                "capped-model",
                "capped-provider",
                model_info=models_dev_entry,
            )
        assert ctx == 128_000
|
||||||
Loading…
Add table
Add a link
Reference in a new issue