mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(stream+output-cap): guard empty streams and parse OpenRouter output-cap errors (#40589)
Two isolated reliability fixes:
- chat_completion_helpers: raise on a zero-chunk stream (no finish_reason,
no content/reasoning/tool_calls) so retry handles it instead of
fabricating a successful empty turn.
- model_metadata: parse the OpenRouter/Nous output-cap error phrasing
("maximum context length is N ... (A of text input, B of tool input,
C in the output)") so parse_available_output_tokens_from_error returns
a real cap and the caller stops looping on it.
Salvaged from #40405 (@ashishpatel26) — took the two stream/error-parsing
fixes. The PR also bundled compression-state changes (on_session_start
clearing _previous_summary; cron session-id prefix preservation, #38788);
those touch the compression hot path and are split out for separate review.
Co-authored-by: ashishpatel26 <ashishpatel26@users.noreply.github.com>
This commit is contained in:
parent
02aad08acf
commit
1fb99b1f22
3 changed files with 58 additions and 0 deletions
|
|
@ -1936,6 +1936,20 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
|
|||
),
|
||||
))
|
||||
|
||||
# Zero-chunk guard: stream yielded nothing usable — a provider/upstream
|
||||
# error or malformed SSE, not a legitimate empty completion. Raise so the
|
||||
# retry machinery handles it instead of fabricating a successful turn.
|
||||
if (
|
||||
finish_reason is None
|
||||
and not content_parts
|
||||
and not reasoning_parts
|
||||
and not tool_calls_acc
|
||||
):
|
||||
raise RuntimeError(
|
||||
"Provider returned an empty stream with no finish_reason "
|
||||
"(possible upstream error or malformed SSE response)."
|
||||
)
|
||||
|
||||
effective_finish_reason = finish_reason or "stop"
|
||||
if has_truncated_tool_args:
|
||||
effective_finish_reason = "length"
|
||||
|
|
|
|||
|
|
@ -964,6 +964,10 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
|
|||
is_output_cap_error = (
|
||||
"max_tokens" in error_lower
|
||||
and ("available_tokens" in error_lower or "available tokens" in error_lower)
|
||||
) or (
|
||||
# OpenRouter/Nous phrasing of the same condition.
|
||||
"in the output" in error_lower
|
||||
and "maximum context length" in error_lower
|
||||
)
|
||||
if not is_output_cap_error:
|
||||
return None
|
||||
|
|
@ -982,6 +986,19 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
|
|||
tokens = int(match.group(1))
|
||||
if tokens >= 1:
|
||||
return tokens
|
||||
|
||||
# OpenRouter/Nous format: "maximum context length is N … (A of text input,
|
||||
# B of tool input, C in the output)". Available output = ctx - text - tool.
|
||||
_m_ctx = re.search(r'maximum context length is (\d+)', error_lower)
|
||||
_m_parts = re.search(
|
||||
r'\((\d+)\s+of text input,\s*(\d+)\s+of tool input,\s*(\d+)\s+in the output\)',
|
||||
error_lower,
|
||||
)
|
||||
if _m_ctx and _m_parts:
|
||||
_available = int(_m_ctx.group(1)) - int(_m_parts.group(1)) - int(_m_parts.group(2))
|
||||
if _available >= 1:
|
||||
return _available
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
|
|||
27
tests/test_output_cap_parsing.py
Normal file
27
tests/test_output_cap_parsing.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
import pytest
|
||||
from agent.model_metadata import parse_available_output_tokens_from_error
|
||||
|
||||
|
||||
class TestParseOpenRouterOutputCap:
    """Checks output-cap extraction from context-breakdown style error text.

    Covers the OpenRouter/Nous wording ("maximum context length is N ...
    (A of text input, B of tool input, C in the output)") alongside the
    ``available_tokens`` wording and an unrelated error message.
    """

    def test_openrouter_breakdown_format(self):
        # Remaining output budget: 200000 - 150000 - 40000 = 10000.
        error_text = (
            "This endpoint's maximum context length is 200000 tokens. "
            "However, you requested about 195000 tokens "
            "(150000 of text input, 40000 of tool input, 5000 in the output)."
        )
        available = parse_available_output_tokens_from_error(error_text)
        assert available == 10000

    def test_anthropic_format_still_works(self):
        # The explicit "available_tokens: N" wording must keep parsing.
        error_text = (
            "max_tokens: 32768 > context_window: 200000 - "
            "input_tokens: 190000 = available_tokens: 10000"
        )
        available = parse_available_output_tokens_from_error(error_text)
        assert available == 10000

    def test_non_output_cap_error_returns_none(self):
        # A message that matches neither wording yields no cap.
        available = parse_available_output_tokens_from_error("some unrelated 400 error")
        assert available is None

    def test_breakdown_with_no_room_returns_none(self):
        # 1000 - 900 - 200 is negative, so no positive cap can be reported.
        error_text = (
            "maximum context length is 1000 tokens "
            "(900 of text input, 200 of tool input, 0 in the output)"
        )
        available = parse_available_output_tokens_from_error(error_text)
        assert available is None
|
||||
Loading…
Add table
Add a link
Reference in a new issue