mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(api-server): emit length/error finish_reason for truncation/failure (#22775)
Non-streaming /v1/chat/completions wrapped any AIAgent result — including partial/failed runs — as a successful 200 with finish_reason='stop' and the internal failure string substituted into message.content. API clients had no way to distinguish 'agent answered: X' from 'agent crashed and the X you see is its error message'.
After the fix:
- completed: True → 200 finish_reason='stop' (unchanged)
- partial + truncated text → 200 finish_reason='length' + hermes extras
- partial + no text / failed → 502 OpenAI error envelope (SDKs raise)
- other failures → 200 finish_reason='error' + hermes extras
Adds X-Hermes-Completed / X-Hermes-Partial / X-Hermes-Error headers plus a 'hermes' extras object on partial responses for clients that want the full picture. Closes #22496.
This commit is contained in:
parent
86f69e8c2a
commit
2124ad72a2
2 changed files with 158 additions and 9 deletions
|
|
@ -1206,10 +1206,49 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
status=500,
|
||||
)
|
||||
|
||||
final_response = result.get("final_response", "")
|
||||
if not final_response:
|
||||
final_response = result.get("error", "(No response generated)")
|
||||
final_response = result.get("final_response") or ""
|
||||
is_partial = bool(result.get("partial"))
|
||||
is_failed = bool(result.get("failed"))
|
||||
completed = bool(result.get("completed", True))
|
||||
err_msg = result.get("error")
|
||||
|
||||
# Decide finish_reason. OpenAI uses "length" for truncation, "stop"
|
||||
# for normal completion, and downstream SDKs accept "error" / custom
|
||||
# codes. See issue #22496.
|
||||
if is_partial and err_msg and "truncat" in err_msg.lower():
|
||||
finish_reason = "length"
|
||||
elif is_failed or (not completed and err_msg):
|
||||
finish_reason = "error"
|
||||
else:
|
||||
finish_reason = "stop"
|
||||
|
||||
response_headers = {
|
||||
"X-Hermes-Session-Id": result.get("session_id", session_id),
|
||||
}
|
||||
if gateway_session_key:
|
||||
response_headers["X-Hermes-Session-Key"] = gateway_session_key
|
||||
|
||||
# Hard-fail path: no usable assistant text AND a real failure → 5xx
|
||||
# with OpenAI-style error envelope so SDK clients raise instead of
|
||||
# silently rendering the internal failure string as message.content.
|
||||
if not final_response and (is_failed or is_partial):
|
||||
err_body = _openai_error(
|
||||
err_msg or "Agent run did not produce a response.",
|
||||
err_type="server_error",
|
||||
code="agent_incomplete",
|
||||
)
|
||||
err_body["error"]["hermes"] = {
|
||||
"completed": completed,
|
||||
"partial": is_partial,
|
||||
"failed": is_failed,
|
||||
}
|
||||
response_headers["X-Hermes-Completed"] = "false"
|
||||
response_headers["X-Hermes-Partial"] = "true" if is_partial else "false"
|
||||
return web.json_response(err_body, status=502, headers=response_headers)
|
||||
|
||||
# Soft-partial path: we have *some* text but the run did not complete
|
||||
# (e.g. truncation with partial buffered output). Still 200 but signal
|
||||
# truncation via finish_reason="length" + Hermes-specific extras.
|
||||
response_data = {
|
||||
"id": completion_id,
|
||||
"object": "chat.completion",
|
||||
|
|
@ -1222,7 +1261,7 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
"role": "assistant",
|
||||
"content": final_response,
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
|
|
@ -1231,12 +1270,19 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
"total_tokens": usage.get("total_tokens", 0),
|
||||
},
|
||||
}
|
||||
if is_partial or is_failed or not completed:
|
||||
response_data["hermes"] = {
|
||||
"completed": completed,
|
||||
"partial": is_partial,
|
||||
"failed": is_failed,
|
||||
"error": err_msg,
|
||||
"error_code": "output_truncated" if finish_reason == "length" else "agent_error",
|
||||
}
|
||||
response_headers["X-Hermes-Completed"] = "false"
|
||||
response_headers["X-Hermes-Partial"] = "true" if is_partial else "false"
|
||||
if err_msg:
|
||||
response_headers["X-Hermes-Error"] = err_msg[:200]
|
||||
|
||||
response_headers = {
|
||||
"X-Hermes-Session-Id": result.get("session_id", session_id),
|
||||
}
|
||||
if gateway_session_key:
|
||||
response_headers["X-Hermes-Session-Key"] = gateway_session_key
|
||||
return web.json_response(response_data, headers=response_headers)
|
||||
|
||||
async def _write_sse_chat_completion(
|
||||
|
|
|
|||
|
|
@ -2418,6 +2418,109 @@ class TestTruncation:
|
|||
assert len(call_kwargs["conversation_history"]) == 150
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Response-side truncation / failure handling (issue #22496)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestChatCompletionsAgentIncomplete:
    """Incomplete agent runs must not be reported as ordinary successes.

    For a partial or failed run, the non-streaming chat-completions endpoint
    has two acceptable answers: a 200 whose finish_reason is 'length' when
    truncated text is available, or a 502 carrying an OpenAI-style error
    envelope when there is no usable text. Issue #22496.
    """

    @staticmethod
    def _zero_usage():
        """Return a fresh all-zero token usage dict for the mocked agent run."""
        return {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}

    @staticmethod
    def _chat_request(prompt):
        """Build a single-user-message request body for /v1/chat/completions."""
        return {
            "model": "hermes-agent",
            "messages": [{"role": "user", "content": prompt}],
        }

    @pytest.mark.asyncio
    async def test_truncation_with_partial_text_uses_length_finish_reason(self, adapter):
        """Truncated run that still has text: 200 with finish_reason='length',
        the partial text as content, and the hermes extras and headers."""
        agent_result = {
            "final_response": "Here is part one of the answer",
            "completed": False,
            "partial": True,
            "error": "Response truncated due to output length limit",
            "messages": [],
            "api_calls": 1,
        }
        payload = self._chat_request("tell me everything")

        async with TestClient(TestServer(_create_app(adapter))) as client:
            with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as run_agent:
                run_agent.return_value = (agent_result, self._zero_usage())
                resp = await client.post("/v1/chat/completions", json=payload)

            assert resp.status == 200
            body = await resp.json()

            choice = body["choices"][0]
            assert choice["finish_reason"] == "length"
            assert choice["message"]["content"] == "Here is part one of the answer"

            extras = body["hermes"]
            assert extras["partial"] is True
            assert extras["completed"] is False
            assert extras["error_code"] == "output_truncated"

            assert resp.headers.get("X-Hermes-Completed") == "false"
            assert resp.headers.get("X-Hermes-Partial") == "true"

    @pytest.mark.asyncio
    async def test_failure_with_no_text_returns_502_error_envelope(self, adapter):
        """Failed run without any assistant text: 502 with an OpenAI error envelope.

        Before the fix the failure string ('Response remained truncated...')
        was placed in message.content with finish_reason='stop', so API
        clients believed the agent had answered.
        """
        agent_result = {
            "final_response": None,
            "completed": False,
            "partial": True,
            "failed": True,
            "error": "Response remained truncated after 3 continuation attempts",
            "messages": [],
            "api_calls": 1,
        }
        payload = self._chat_request("x")

        async with TestClient(TestServer(_create_app(adapter))) as client:
            with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as run_agent:
                run_agent.return_value = (agent_result, self._zero_usage())
                resp = await client.post("/v1/chat/completions", json=payload)

            # Hard fail: SDK clients will raise on this status
            assert resp.status == 502
            body = await resp.json()

            error = body["error"]
            assert error["code"] == "agent_incomplete"
            assert "truncated" in error["message"].lower()
            assert error["hermes"]["partial"] is True
            assert error["hermes"]["failed"] is True

            assert resp.headers.get("X-Hermes-Completed") == "false"

    @pytest.mark.asyncio
    async def test_normal_completion_unchanged(self, adapter):
        """Happy path stays as it was: a completed run returns
        finish_reason='stop' with no hermes extras or completion header."""
        agent_result = {
            "final_response": "All good.",
            "completed": True,
            "partial": False,
            "failed": False,
            "messages": [],
            "api_calls": 1,
        }
        payload = self._chat_request("hi")

        async with TestClient(TestServer(_create_app(adapter))) as client:
            with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as run_agent:
                run_agent.return_value = (agent_result, self._zero_usage())
                resp = await client.post("/v1/chat/completions", json=payload)

            assert resp.status == 200
            body = await resp.json()

            choice = body["choices"][0]
            assert choice["finish_reason"] == "stop"
            assert choice["message"]["content"] == "All good."

            assert "hermes" not in body
            assert "X-Hermes-Completed" not in resp.headers
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CORS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue