fix(gateway): /usage now shows rate limits, cost, and token details between turns (#7038)

The gateway /usage handler only looked in _running_agents for the agent
object, which is only populated while the agent is actively processing a
message. Between turns (when users actually type /usage), the dict is
empty and the handler fell through to a rough message-count estimate.

The agent object actually lives in _agent_cache between turns (kept for
prompt caching). This fix checks both dicts, with _running_agents taking
priority (mid-turn) and _agent_cache as the between-turns fallback.

Also brings the gateway output to parity with the CLI /usage:
- Model name
- Detailed token breakdown (input, output, cache read, cache write)
- Cost estimation (estimated amount or 'included' for subscriptions)
- Cache token lines hidden when zero (cleaner output)

This fixes Nous Portal rate limit headers not showing up for gateway
users — the data was being captured correctly but the handler could
never see it.
This commit is contained in:
Teknium 2026-04-10 02:33:01 -07:00 committed by GitHub
parent 8779a268a7
commit 6da952bc50
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 233 additions and 7 deletions

View file

@ -5274,27 +5274,76 @@ class GatewayRunner:
)
async def _handle_usage_command(self, event: MessageEvent) -> str:
"""Handle /usage command -- show token usage for the session's last agent run."""
"""Handle /usage command -- show token usage for the current session.
Checks both _running_agents (mid-turn) and _agent_cache (between turns)
so that rate limits, cost estimates, and detailed token breakdowns are
available whenever the user asks, not only while the agent is running.
"""
source = event.source
session_key = self._session_key_for_source(source)
# Try running agent first (mid-turn), then cached agent (between turns)
agent = self._running_agents.get(session_key)
if not agent or agent is _AGENT_PENDING_SENTINEL:
_cache_lock = getattr(self, "_agent_cache_lock", None)
_cache = getattr(self, "_agent_cache", None)
if _cache_lock and _cache is not None:
with _cache_lock:
cached = _cache.get(session_key)
if cached:
agent = cached[0]
if agent and hasattr(agent, "session_total_tokens") and agent.session_api_calls > 0:
lines = []
# Rate limits first (when available from provider headers)
# Rate limits (when available from provider headers)
rl_state = agent.get_rate_limit_state()
if rl_state and rl_state.has_data:
from agent.rate_limit_tracker import format_rate_limit_compact
lines.append(f"⏱️ **Rate Limits:** {format_rate_limit_compact(rl_state)}")
lines.append("")
# Session token usage
# Session token usage — detailed breakdown matching CLI
input_tokens = getattr(agent, "session_input_tokens", 0) or 0
output_tokens = getattr(agent, "session_output_tokens", 0) or 0
cache_read = getattr(agent, "session_cache_read_tokens", 0) or 0
cache_write = getattr(agent, "session_cache_write_tokens", 0) or 0
lines.append("📊 **Session Token Usage**")
lines.append(f"Prompt (input): {agent.session_prompt_tokens:,}")
lines.append(f"Completion (output): {agent.session_completion_tokens:,}")
lines.append(f"Model: `{agent.model}`")
lines.append(f"Input tokens: {input_tokens:,}")
if cache_read:
lines.append(f"Cache read tokens: {cache_read:,}")
if cache_write:
lines.append(f"Cache write tokens: {cache_write:,}")
lines.append(f"Output tokens: {output_tokens:,}")
lines.append(f"Total: {agent.session_total_tokens:,}")
lines.append(f"API calls: {agent.session_api_calls}")
# Cost estimation
try:
from agent.usage_pricing import CanonicalUsage, estimate_usage_cost
cost_result = estimate_usage_cost(
agent.model,
CanonicalUsage(
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_read_tokens=cache_read,
cache_write_tokens=cache_write,
),
provider=getattr(agent, "provider", None),
base_url=getattr(agent, "base_url", None),
)
if cost_result.amount_usd is not None:
prefix = "~" if cost_result.status == "estimated" else ""
lines.append(f"Cost: {prefix}${float(cost_result.amount_usd):.4f}")
elif cost_result.status == "included":
lines.append("Cost: included")
except Exception:
pass
# Context window and compressions
ctx = agent.context_compressor
if ctx.last_prompt_tokens:
pct = min(100, ctx.last_prompt_tokens / ctx.context_length * 100) if ctx.context_length else 0
@ -5304,7 +5353,7 @@ class GatewayRunner:
return "\n".join(lines)
# No running agent -- check session history for a rough count
# No agent at all -- check session history for a rough count
session_entry = self.session_store.get_or_create_session(source)
history = self.session_store.load_transcript(session_entry.session_id)
if history:
@ -5315,7 +5364,7 @@ class GatewayRunner:
f"📊 **Session Info**\n"
f"Messages: {len(msgs)}\n"
f"Estimated context: ~{approx:,} tokens\n"
f"_(Detailed usage available during active conversations)_"
f"_(Detailed usage available after the first agent response)_"
)
return "No usage data available for this session."

View file

@ -0,0 +1,177 @@
"""Tests for gateway /usage command — agent cache lookup and output fields."""
import asyncio
import threading
from unittest.mock import MagicMock, patch
import pytest
def _make_mock_agent(**overrides):
    """Create a mock AIAgent with realistic session counters.

    Any keyword argument overrides the corresponding default attribute,
    letting individual tests tweak a single counter without restating
    the whole set.
    """
    agent = MagicMock()
    attrs = {
        "model": "anthropic/claude-sonnet-4.6",
        "provider": "openrouter",
        "base_url": None,
        "session_total_tokens": 50_000,
        "session_api_calls": 5,
        "session_prompt_tokens": 40_000,
        "session_completion_tokens": 10_000,
        "session_input_tokens": 35_000,
        "session_output_tokens": 10_000,
        "session_cache_read_tokens": 5_000,
        "session_cache_write_tokens": 2_000,
        **overrides,
    }
    # setattr (not MagicMock(**attrs)) so an override named "name" would set
    # a plain attribute rather than the mock's special display name.
    for attr_name, attr_value in attrs.items():
        setattr(agent, attr_name, attr_value)
    # Rate-limit state: has_data=True so the handler renders the limits line.
    agent.get_rate_limit_state.return_value = MagicMock(has_data=True)
    # Context compressor with a partially-used 200k window and one compression.
    agent.context_compressor = MagicMock(
        last_prompt_tokens=30_000,
        context_length=200_000,
        compression_count=1,
    )
    return agent
def _make_runner(session_key, agent=None, cached_agent=None):
    """Build a bare GatewayRunner with just the fields _handle_usage_command needs.

    Uses object.__new__ to bypass GatewayRunner.__init__ (which wires up real
    transports); only the attributes the /usage handler reads are populated.

    Args:
        session_key: key under which agents are registered/cached.
        agent: optional agent placed in _running_agents (mid-turn case).
        cached_agent: optional agent placed in _agent_cache (between-turns case).
    """
    # NOTE: previously also imported _AGENT_PENDING_SENTINEL, which was unused
    # here — tests that need the sentinel import it themselves.
    from gateway.run import GatewayRunner

    runner = object.__new__(GatewayRunner)
    runner._running_agents = {}
    runner._running_agents_ts = {}
    runner._agent_cache = {}
    runner._agent_cache_lock = threading.Lock()
    runner.session_store = MagicMock()
    if agent is not None:
        runner._running_agents[session_key] = agent
    if cached_agent is not None:
        # Real cache entries are (agent, signature) tuples.
        runner._agent_cache[session_key] = (cached_agent, "sig")
    # Wire helper: route every event source to the fixed key under test.
    runner._session_key_for_source = MagicMock(return_value=session_key)
    return runner
# Session key shared by all tests below (agent "main", Telegram private chat).
SK = "agent:main:telegram:private:12345"
class TestUsageCachedAgent:
    """The main fix: /usage should find agents in _agent_cache between turns."""

    @pytest.mark.asyncio
    async def test_cached_agent_shows_detailed_usage(self):
        """A cached (between-turns) agent yields the full detailed breakdown."""
        agent = _make_mock_agent()
        runner = _make_runner(SK, cached_agent=agent)
        event = MagicMock()
        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"), \
             patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
            mock_cost.return_value = MagicMock(amount_usd=0.1234, status="estimated")
            result = await runner._handle_usage_command(event)
        assert "claude-sonnet-4.6" in result
        # Assert full labelled lines, not bare numbers: "5,000" is a substring
        # of "35,000", so a bare-number check would pass even if the cache
        # lines were missing entirely.
        assert "Input tokens: 35,000" in result
        assert "Output tokens: 10,000" in result
        assert "Cache read tokens: 5,000" in result
        assert "Cache write tokens: 2,000" in result
        assert "Total: 50,000" in result
        # status == "estimated" → handler prefixes the amount with "~".
        assert "Cost: ~$0.1234" in result
        assert "30,000" in result  # context window usage
        assert "Compressions: 1" in result

    @pytest.mark.asyncio
    async def test_running_agent_preferred_over_cache(self):
        """When agent is in both dicts, the running one wins."""
        running = _make_mock_agent(session_api_calls=10, session_total_tokens=80_000)
        cached = _make_mock_agent(session_api_calls=5, session_total_tokens=50_000)
        runner = _make_runner(SK, agent=running, cached_agent=cached)
        event = MagicMock()
        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"), \
             patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
            mock_cost.return_value = MagicMock(amount_usd=None, status="unknown")
            result = await runner._handle_usage_command(event)
        # Labelled line so the cached agent's 50,000 can't satisfy the check.
        assert "Total: 80,000" in result  # running agent's total
        assert "API calls: 10" in result

    @pytest.mark.asyncio
    async def test_sentinel_skipped_uses_cache(self):
        """PENDING sentinel in _running_agents should fall through to cache."""
        from gateway.run import _AGENT_PENDING_SENTINEL
        cached = _make_mock_agent()
        runner = _make_runner(SK, cached_agent=cached)
        runner._running_agents[SK] = _AGENT_PENDING_SENTINEL
        event = MagicMock()
        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"), \
             patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
            mock_cost.return_value = MagicMock(amount_usd=None, status="unknown")
            result = await runner._handle_usage_command(event)
        assert "claude-sonnet-4.6" in result
        assert "Session Token Usage" in result

    @pytest.mark.asyncio
    async def test_no_agent_anywhere_falls_to_history(self):
        """No running or cached agent → rough estimate from transcript."""
        runner = _make_runner(SK)
        event = MagicMock()
        session_entry = MagicMock()
        session_entry.session_id = "sess123"
        runner.session_store.get_or_create_session.return_value = session_entry
        runner.session_store.load_transcript.return_value = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi there"},
        ]
        with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=500):
            result = await runner._handle_usage_command(event)
        assert "Session Info" in result
        assert "Messages: 2" in result
        assert "~500" in result

    @pytest.mark.asyncio
    async def test_cache_read_write_hidden_when_zero(self):
        """Cache token lines should be omitted when zero."""
        agent = _make_mock_agent(session_cache_read_tokens=0, session_cache_write_tokens=0)
        runner = _make_runner(SK, cached_agent=agent)
        event = MagicMock()
        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"), \
             patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
            mock_cost.return_value = MagicMock(amount_usd=None, status="unknown")
            result = await runner._handle_usage_command(event)
        assert "Cache read" not in result
        assert "Cache write" not in result

    @pytest.mark.asyncio
    async def test_cost_included_status(self):
        """Subscription-included providers show 'included' instead of dollar amount."""
        agent = _make_mock_agent(provider="openai-codex")
        runner = _make_runner(SK, cached_agent=agent)
        event = MagicMock()
        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"), \
             patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
            mock_cost.return_value = MagicMock(amount_usd=None, status="included")
            result = await runner._handle_usage_command(event)
        assert "Cost: included" in result