hermes-agent/tests/test_output_cap_parsing.py
xxxigm 57775e9e16 test(agent): cover char-based output-cap overflow parsing (#42741)
Add TestParseCharBasedOutputCap for the LM Studio / llama.cpp phrasing
(context in tokens, prompt in characters): the reported error resolves to
the available output budget, the retried cap plus the estimated input
stays inside the window, and a prompt larger than the window falls through
to None so the prompt-too-long/compression path still owns that case.
2026-06-09 03:17:12 -07:00

64 lines
3.1 KiB
Python

import pytest
from agent.model_metadata import parse_available_output_tokens_from_error
class TestParseOpenRouterOutputCap:
    """Output-cap errors phrased as a context breakdown (OpenRouter/Nous)."""

    def test_openrouter_breakdown_format(self):
        """A text/tool/output breakdown message resolves to 10000 tokens."""
        error_text = (
            "This endpoint's maximum context length is 200000 tokens. "
            "However, you requested about 195000 tokens "
            "(150000 of text input, 40000 of tool input, 5000 in the output)."
        )
        # Context minus both input parts: 200000 - 150000 - 40000 = 10000.
        budget = parse_available_output_tokens_from_error(error_text)
        assert budget == 10000

    def test_anthropic_format_still_works(self):
        """The Anthropic-style arithmetic message also resolves to 10000."""
        error_text = (
            "max_tokens: 32768 > context_window: 200000 - "
            "input_tokens: 190000 = available_tokens: 10000"
        )
        budget = parse_available_output_tokens_from_error(error_text)
        assert budget == 10000

    def test_non_output_cap_error_returns_none(self):
        """A message that is not an output-cap error yields None."""
        budget = parse_available_output_tokens_from_error(
            "some unrelated 400 error"
        )
        assert budget is None

    def test_breakdown_with_no_room_returns_none(self):
        """A breakdown whose inputs already exceed the context yields None."""
        # 1000 - 900 - 200 is not positive, and a non-positive cap must not
        # be returned.
        error_text = (
            "maximum context length is 1000 tokens "
            "(900 of text input, 200 of tool input, 0 in the output)"
        )
        budget = parse_available_output_tokens_from_error(error_text)
        assert budget is None
class TestParseCharBasedOutputCap:
    """Errors that give the context in tokens but the prompt in characters.

    This is the LM Studio / llama.cpp phrasing. Per #42741, these servers
    answer with a hard 400 even for a trivial prompt when the default output
    cap equals the context window: the request then asks for the entire
    window as output, so no room is left for the input.
    """

    def test_char_based_output_cap_format(self):
        """The full server message resolves to the remaining output budget."""
        error_text = (
            "This model's maximum context length is 65536 tokens. However, "
            "you requested 65536 output tokens and your prompt contains "
            "77409 characters (more than 0 characters, which is the upper "
            "bound for 0 input tokens). Please reduce the length of the "
            "input prompt or the number of requested output tokens."
        )
        # Estimated input is ceil(77409 / 3) = 25803 tokens, which leaves
        # 65536 - 25803 = 39733 tokens for output.
        budget = parse_available_output_tokens_from_error(error_text)
        assert budget == 39733

    def test_char_based_leaves_room_for_input(self):
        """The returned cap plus the estimated input fits in the window."""
        window = 65536
        prompt_chars = 77409
        error_text = (
            f"maximum context length is {window} tokens. However, you requested "
            f"{window} output tokens and your prompt contains {prompt_chars} characters."
        )
        budget = parse_available_output_tokens_from_error(error_text)

        # Integer ceil of prompt_chars / 3, i.e. the estimated input tokens.
        estimated_input = (prompt_chars + 2) // 3

        assert budget is not None
        # The point of the fix: retried output cap + input stays in-window.
        assert budget + estimated_input <= window

    def test_char_based_no_room_returns_none(self):
        """A prompt larger than the window (in tokens) yields None."""
        # This is not an output-cap problem; the prompt-too-long /
        # compression path is expected to handle it instead.
        error_text = (
            "maximum context length is 1000 tokens. However, you requested "
            "1000 output tokens and your prompt contains 9000 characters."
        )
        budget = parse_available_output_tokens_from_error(error_text)
        assert budget is None