mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: capture provider rate limit headers and show in /usage (#6541)
Parse x-ratelimit-* headers from inference API responses (Nous Portal,
OpenRouter, OpenAI-compatible) and display them in the /usage command.
- New agent/rate_limit_tracker.py: parse 12 rate limit headers (RPM/RPH/
TPM/TPH limits, remaining, reset timers), format as progress bars (CLI)
or compact one-liner (gateway)
- Hook into streaming path in run_agent.py: stream.response.headers is
available on the OpenAI SDK Stream object before chunks are consumed
- CLI /usage: appends rate limit section with progress bars + warnings
when any bucket exceeds 80%
- Gateway /usage: appends compact rate limit summary
- 24 unit tests covering parsing, formatting, edge cases
Headers captured per response:
x-ratelimit-{limit,remaining,reset}-{requests,tokens}{,-1h}
Example CLI display:
Nous Rate Limits (captured just now):
Requests/min [░░░░░░░░░░░░░░░░░░░░] 0.1% 1/800 used (799 left, resets in 59s)
Tokens/hr [░░░░░░░░░░░░░░░░░░░░] 0.0% 49/336.0M (336.0M left, resets in 52m)
This commit is contained in:
parent
3c8ec7037c
commit
8dfc96dbbb
6 changed files with 519 additions and 15 deletions
23
cli.py
23
cli.py
|
|
@ -5409,12 +5409,27 @@ class HermesCLI:
|
|||
print(f" ❌ Compression failed: {e}")
|
||||
|
||||
def _show_usage(self):
|
||||
"""Show cumulative token usage for the current session."""
|
||||
"""Show rate limits (if available) and session token usage."""
|
||||
if not self.agent:
|
||||
print("(._.) No active agent -- send a message first.")
|
||||
return
|
||||
|
||||
agent = self.agent
|
||||
calls = agent.session_api_calls
|
||||
|
||||
if calls == 0:
|
||||
print("(._.) No API calls made yet in this session.")
|
||||
return
|
||||
|
||||
# ── Rate limits (shown first when available) ────────────────
|
||||
rl_state = agent.get_rate_limit_state()
|
||||
if rl_state and rl_state.has_data:
|
||||
from agent.rate_limit_tracker import format_rate_limit_display
|
||||
print()
|
||||
print(format_rate_limit_display(rl_state))
|
||||
print()
|
||||
|
||||
# ── Session token usage ─────────────────────────────────────
|
||||
input_tokens = getattr(agent, "session_input_tokens", 0) or 0
|
||||
output_tokens = getattr(agent, "session_output_tokens", 0) or 0
|
||||
cache_read_tokens = getattr(agent, "session_cache_read_tokens", 0) or 0
|
||||
|
|
@ -5422,13 +5437,7 @@ class HermesCLI:
|
|||
prompt = agent.session_prompt_tokens
|
||||
completion = agent.session_completion_tokens
|
||||
total = agent.session_total_tokens
|
||||
calls = agent.session_api_calls
|
||||
|
||||
if calls == 0:
|
||||
print("(._.) No API calls made yet in this session.")
|
||||
return
|
||||
|
||||
# Current context window state
|
||||
compressor = agent.context_compressor
|
||||
last_prompt = compressor.last_prompt_tokens
|
||||
ctx_len = compressor.context_length
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue