diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index fca2e3bd005..257eece57bd 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -1936,6 +1936,20 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta= ), )) + # Zero-chunk guard: stream yielded nothing usable — a provider/upstream + # error or malformed SSE, not a legitimate empty completion. Raise so the + # retry machinery handles it instead of fabricating a successful turn. + if ( + finish_reason is None + and not content_parts + and not reasoning_parts + and not tool_calls_acc + ): + raise RuntimeError( + "Provider returned an empty stream with no finish_reason " + "(possible upstream error or malformed SSE response)." + ) + effective_finish_reason = finish_reason or "stop" if has_truncated_tool_args: effective_finish_reason = "length" diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 0ce9d0c6361..1080256e0ac 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -964,6 +964,10 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]: is_output_cap_error = ( "max_tokens" in error_lower and ("available_tokens" in error_lower or "available tokens" in error_lower) + ) or ( + # OpenRouter/Nous phrasing of the same condition. + "in the output" in error_lower + and "maximum context length" in error_lower ) if not is_output_cap_error: return None @@ -982,6 +986,19 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]: tokens = int(match.group(1)) if tokens >= 1: return tokens + + # OpenRouter/Nous format: "maximum context length is N … (A of text input, + # B of tool input, C in the output)". Available output = ctx - text - tool. + _m_ctx = re.search(r'maximum context length is (\d+)', error_lower) + _m_parts = re.search( + r'\((\d+)\s+of text input,\s*(\d+)\s+of tool input,\s*(\d+)\s+in the output\)', + error_lower, + ) + if _m_ctx and _m_parts: + _available = int(_m_ctx.group(1)) - int(_m_parts.group(1)) - int(_m_parts.group(2)) + if _available >= 1: + return _available + return None diff --git a/tests/test_output_cap_parsing.py b/tests/test_output_cap_parsing.py new file mode 100644 index 00000000000..4f989622b14 --- /dev/null +++ b/tests/test_output_cap_parsing.py @@ -0,0 +1,27 @@ +import pytest +from agent.model_metadata import parse_available_output_tokens_from_error + + +class TestParseOpenRouterOutputCap: + """OpenRouter/Nous phrase the output-cap error as a context breakdown.""" + + def test_openrouter_breakdown_format(self): + msg = ("This endpoint's maximum context length is 200000 tokens. " + "However, you requested about 195000 tokens " + "(150000 of text input, 40000 of tool input, 5000 in the output).") + # available output = 200000 - 150000 - 40000 = 10000 + assert parse_available_output_tokens_from_error(msg) == 10000 + + def test_anthropic_format_still_works(self): + msg = ("max_tokens: 32768 > context_window: 200000 - " + "input_tokens: 190000 = available_tokens: 10000") + assert parse_available_output_tokens_from_error(msg) == 10000 + + def test_non_output_cap_error_returns_none(self): + assert parse_available_output_tokens_from_error("some unrelated 400 error") is None + + def test_breakdown_with_no_room_returns_none(self): + # ctx - text - tool <= 0 -> None (don't return a non-positive cap) + msg = ("maximum context length is 1000 tokens " + "(900 of text input, 200 of tool input, 0 in the output)") + assert parse_available_output_tokens_from_error(msg) is None