diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py index faee4c23b61..357ecbd4785 100644 --- a/gateway/platforms/api_server.py +++ b/gateway/platforms/api_server.py @@ -1206,10 +1206,49 @@ class APIServerAdapter(BasePlatformAdapter): status=500, ) - final_response = result.get("final_response", "") - if not final_response: - final_response = result.get("error", "(No response generated)") + final_response = result.get("final_response") or "" + is_partial = bool(result.get("partial")) + is_failed = bool(result.get("failed")) + completed = bool(result.get("completed", True)) + err_msg = result.get("error") + # Decide finish_reason. OpenAI uses "length" for truncation, "stop" + # for normal completion, and downstream SDKs accept "error" / custom + # codes. See issue #22496. + if is_partial and err_msg and "truncat" in err_msg.lower(): + finish_reason = "length" + elif is_failed or (not completed and err_msg): + finish_reason = "error" + else: + finish_reason = "stop" + + response_headers = { + "X-Hermes-Session-Id": result.get("session_id", session_id), + } + if gateway_session_key: + response_headers["X-Hermes-Session-Key"] = gateway_session_key + + # Hard-fail path: no usable assistant text AND a real failure → 5xx + # with OpenAI-style error envelope so SDK clients raise instead of + # silently rendering the internal failure string as message.content. + if not final_response and (is_failed or is_partial): + err_body = _openai_error( + err_msg or "Agent run did not produce a response.", + err_type="server_error", + code="agent_incomplete", + ) + err_body["error"]["hermes"] = { + "completed": completed, + "partial": is_partial, + "failed": is_failed, + } + response_headers["X-Hermes-Completed"] = "false" + response_headers["X-Hermes-Partial"] = "true" if is_partial else "false" + return web.json_response(err_body, status=502, headers=response_headers) + + # Soft-partial path: we have *some* text but the run did not complete + # (e.g. truncation with partial buffered output). Still 200 but signal + # truncation via finish_reason="length" + Hermes-specific extras. response_data = { "id": completion_id, "object": "chat.completion", @@ -1222,7 +1261,7 @@ class APIServerAdapter(BasePlatformAdapter): "role": "assistant", "content": final_response, }, - "finish_reason": "stop", + "finish_reason": finish_reason, } ], "usage": { @@ -1231,12 +1270,19 @@ class APIServerAdapter(BasePlatformAdapter): "total_tokens": usage.get("total_tokens", 0), }, } + if is_partial or is_failed or not completed: + response_data["hermes"] = { + "completed": completed, + "partial": is_partial, + "failed": is_failed, + "error": err_msg, + "error_code": "output_truncated" if finish_reason == "length" else "agent_error", + } + response_headers["X-Hermes-Completed"] = "false" + response_headers["X-Hermes-Partial"] = "true" if is_partial else "false" + if err_msg: + response_headers["X-Hermes-Error"] = err_msg[:200] - response_headers = { - "X-Hermes-Session-Id": result.get("session_id", session_id), - } - if gateway_session_key: - response_headers["X-Hermes-Session-Key"] = gateway_session_key return web.json_response(response_data, headers=response_headers) async def _write_sse_chat_completion( diff --git a/tests/gateway/test_api_server.py b/tests/gateway/test_api_server.py index 5170a1736a9..9e00a375871 100644 --- a/tests/gateway/test_api_server.py +++ b/tests/gateway/test_api_server.py @@ -2418,6 +2418,109 @@ class TestTruncation: assert len(call_kwargs["conversation_history"]) == 150 +# --------------------------------------------------------------------------- +# Response-side truncation / failure handling (issue #22496) +# --------------------------------------------------------------------------- + + +class TestChatCompletionsAgentIncomplete: + """When the agent run yields a partial / failed result, the API server + must NOT pretend it succeeded. Either signal truncation via + finish_reason='length' (with the partial text), or 502 with an OpenAI + error envelope (no usable text). Issue #22496.""" + + @pytest.mark.asyncio + async def test_truncation_with_partial_text_uses_length_finish_reason(self, adapter): + """Partial text + truncation marker → finish_reason='length', 200 OK, + plus hermes extras + headers.""" + mock_result = { + "final_response": "Here is part one of the answer", + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit", + "messages": [], + "api_calls": 1, + } + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={"model": "hermes-agent", "messages": [{"role": "user", "content": "tell me everything"}]}, + ) + assert resp.status == 200 + data = await resp.json() + assert data["choices"][0]["finish_reason"] == "length" + assert data["choices"][0]["message"]["content"] == "Here is part one of the answer" + assert data["hermes"]["partial"] is True + assert data["hermes"]["completed"] is False + assert data["hermes"]["error_code"] == "output_truncated" + assert resp.headers.get("X-Hermes-Completed") == "false" + assert resp.headers.get("X-Hermes-Partial") == "true" + + @pytest.mark.asyncio + async def test_failure_with_no_text_returns_502_error_envelope(self, adapter): + """No usable assistant text + failure → 502 with OpenAI error envelope. + + Pre-fix behavior: the failure string ('Response remained truncated...') + was substituted into message.content with finish_reason='stop', + making API clients think the agent had answered. + """ + mock_result = { + "final_response": None, + "completed": False, + "partial": True, + "failed": True, + "error": "Response remained truncated after 3 continuation attempts", + "messages": [], + "api_calls": 1, + } + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={"model": "hermes-agent", "messages": [{"role": "user", "content": "x"}]}, + ) + # Hard fail: SDK clients will raise on this status + assert resp.status == 502 + data = await resp.json() + assert data["error"]["code"] == "agent_incomplete" + assert "truncated" in data["error"]["message"].lower() + assert data["error"]["hermes"]["partial"] is True + assert data["error"]["hermes"]["failed"] is True + assert resp.headers.get("X-Hermes-Completed") == "false" + + @pytest.mark.asyncio + async def test_normal_completion_unchanged(self, adapter): + """Sanity: a completed-True result still returns finish_reason='stop' + and no hermes extras (preserves the existing happy-path contract).""" + mock_result = { + "final_response": "All good.", + "completed": True, + "partial": False, + "failed": False, + "messages": [], + "api_calls": 1, + } + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={"model": "hermes-agent", "messages": [{"role": "user", "content": "hi"}]}, + ) + assert resp.status == 200 + data = await resp.json() + assert data["choices"][0]["finish_reason"] == "stop" + assert data["choices"][0]["message"]["content"] == "All good." + assert "hermes" not in data + assert "X-Hermes-Completed" not in resp.headers + + # --------------------------------------------------------------------------- # CORS # ---------------------------------------------------------------------------