fix(api-server): emit length/error finish_reason for truncation/failure (#22775)

Non-streaming /v1/chat/completions wrapped any AIAgent result \u2014 including partial/failed runs \u2014 as a successful 200 with finish_reason='stop' and the internal failure string substituted into message.content. API clients had no way to distinguish 'agent answered: X' from 'agent crashed and the X you see is its error message'. After the fix: - completed: True \u2192 200 finish_reason='stop' (unchanged) - partial + truncated text \u2192 200 finish_reason='length' + hermes extras - partial + no text / failed \u2192 502 OpenAI error envelope (SDKs raise) - other failures \u2192 200 finish_reason='error' + hermes extras Adds X-Hermes-Completed / X-Hermes-Partial / X-Hermes-Error headers plus a 'hermes' extras object on partial responses for clients that want the full picture. Closes #22496.
2026-05-18 04:41:56 +00:00 · 2026-05-09 12:48:08 -07:00 · 2026-05-09 12:48:08 -07:00 · 2124ad72a2
commit 2124ad72a2
parent 86f69e8c2a
2 changed files with 158 additions and 9 deletions
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@ -1206,10 +1206,49 @@ class APIServerAdapter(BasePlatformAdapter):
                    status=500,
                )

-        final_response = result.get("final_response", "")
-        if not final_response:
-            final_response = result.get("error", "(No response generated)")
+        final_response = result.get("final_response") or ""
+        is_partial = bool(result.get("partial"))
+        is_failed = bool(result.get("failed"))
+        completed = bool(result.get("completed", True))
+        err_msg = result.get("error")

+        # Decide finish_reason. OpenAI uses "length" for truncation, "stop"
+        # for normal completion, and downstream SDKs accept "error" / custom
+        # codes. See issue #22496.
+        if is_partial and err_msg and "truncat" in err_msg.lower():
+            finish_reason = "length"
+        elif is_failed or (not completed and err_msg):
+            finish_reason = "error"
+        else:
+            finish_reason = "stop"
+
+        response_headers = {
+            "X-Hermes-Session-Id": result.get("session_id", session_id),
+        }
+        if gateway_session_key:
+            response_headers["X-Hermes-Session-Key"] = gateway_session_key
+
+        # Hard-fail path: no usable assistant text AND a real failure → 5xx
+        # with OpenAI-style error envelope so SDK clients raise instead of
+        # silently rendering the internal failure string as message.content.
+        if not final_response and (is_failed or is_partial):
+            err_body = _openai_error(
+                err_msg or "Agent run did not produce a response.",
+                err_type="server_error",
+                code="agent_incomplete",
+            )
+            err_body["error"]["hermes"] = {
+                "completed": completed,
+                "partial": is_partial,
+                "failed": is_failed,
+            }
+            response_headers["X-Hermes-Completed"] = "false"
+            response_headers["X-Hermes-Partial"] = "true" if is_partial else "false"
+            return web.json_response(err_body, status=502, headers=response_headers)
+
+        # Soft-partial path: we have *some* text but the run did not complete
+        # (e.g. truncation with partial buffered output). Still 200 but signal
+        # truncation via finish_reason="length" + Hermes-specific extras.
        response_data = {
            "id": completion_id,
            "object": "chat.completion",
@ -1222,7 +1261,7 @@ class APIServerAdapter(BasePlatformAdapter):
                        "role": "assistant",
                        "content": final_response,
                    },
-                    "finish_reason": "stop",
+                    "finish_reason": finish_reason,
                }
            ],
            "usage": {
@ -1231,12 +1270,19 @@ class APIServerAdapter(BasePlatformAdapter):
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
+        if is_partial or is_failed or not completed:
+            response_data["hermes"] = {
+                "completed": completed,
+                "partial": is_partial,
+                "failed": is_failed,
+                "error": err_msg,
+                "error_code": "output_truncated" if finish_reason == "length" else "agent_error",
+            }
+            response_headers["X-Hermes-Completed"] = "false"
+            response_headers["X-Hermes-Partial"] = "true" if is_partial else "false"
+            if err_msg:
+                response_headers["X-Hermes-Error"] = err_msg[:200]

-        response_headers = {
-            "X-Hermes-Session-Id": result.get("session_id", session_id),
-        }
-        if gateway_session_key:
-            response_headers["X-Hermes-Session-Key"] = gateway_session_key
        return web.json_response(response_data, headers=response_headers)

    async def _write_sse_chat_completion(