feat: enhance Responses API — retrieval, deletion, tool calls, usage, CORS

Cherry-picked from PR #828.
This commit is contained in:
teknium1 2026-03-11 08:53:54 -07:00
parent 58dc5c4af1
commit 7d771c2b1b
2 changed files with 526 additions and 50 deletions

View file

@ -2,10 +2,12 @@
OpenAI-compatible API server platform adapter.
Exposes an HTTP server with endpoints:
- POST /v1/chat/completions OpenAI Chat Completions format (stateless)
- POST /v1/responses OpenAI Responses API format (stateful via previous_response_id)
- GET /v1/models lists hermes-agent as an available model
- GET /health health check
- POST /v1/chat/completions OpenAI Chat Completions format (stateless)
- POST /v1/responses OpenAI Responses API format (stateful via previous_response_id)
- GET /v1/responses/{response_id} Retrieve a stored response
- DELETE /v1/responses/{response_id} Delete a stored response
- GET /v1/models lists hermes-agent as an available model
- GET /health health check
Any OpenAI-compatible frontend (Open WebUI, LobeChat, etc.) can connect
to hermes-agent through this adapter.
@ -82,10 +84,41 @@ class ResponseStore:
while len(self._store) > self._max_size:
self._store.popitem(last=False)
def delete(self, response_id: str) -> bool:
    """Remove a response from the store. Returns True if found and deleted."""
    # EAFP: attempt the removal and treat a missing key as "not found".
    try:
        del self._store[response_id]
    except KeyError:
        return False
    return True
def __len__(self) -> int:
    """Return the number of responses currently held in the store."""
    return len(self._store)
# ---------------------------------------------------------------------------
# CORS middleware
# ---------------------------------------------------------------------------

# Permissive cross-origin policy so browser-based OpenAI-compatible frontends
# (Open WebUI, LobeChat, ...) can call the adapter from any origin.
_CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET, POST, DELETE, OPTIONS",
    "Access-Control-Allow-Headers": "Authorization, Content-Type",
}

if AIOHTTP_AVAILABLE:

    @web.middleware
    async def cors_middleware(request, handler):
        """Add CORS headers to every response; handle OPTIONS preflight."""
        # Preflight requests are answered directly, never reaching a handler.
        if request.method == "OPTIONS":
            return web.Response(status=200, headers=_CORS_HEADERS)
        resp = await handler(request)
        resp.headers.update(_CORS_HEADERS)
        return resp

else:
    # Without aiohttp the server cannot start, so there is no middleware.
    cors_middleware = None  # type: ignore[assignment]
class APIServerAdapter(BasePlatformAdapter):
"""
OpenAI-compatible HTTP API server adapter.
@ -271,7 +304,7 @@ class APIServerAdapter(BasePlatformAdapter):
# Run the agent in an executor (run_conversation is synchronous)
session_id = str(uuid.uuid4())
try:
result = await self._run_agent(
result, usage = await self._run_agent(
user_message=user_message,
conversation_history=history,
ephemeral_system_prompt=system_prompt,
@ -305,9 +338,9 @@ class APIServerAdapter(BasePlatformAdapter):
}
],
"usage": {
"prompt_tokens": 0,
"completion_tokens": 0,
"total_tokens": 0,
"prompt_tokens": usage.get("input_tokens", 0),
"completion_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
},
}
@ -394,10 +427,14 @@ class APIServerAdapter(BasePlatformAdapter):
status=400,
)
# Truncation support
if body.get("truncation") == "auto" and len(conversation_history) > 100:
conversation_history = conversation_history[-100:]
# Run the agent
session_id = str(uuid.uuid4())
try:
result = await self._run_agent(
result, usage = await self._run_agent(
user_message=user_message,
conversation_history=conversation_history,
ephemeral_system_prompt=instructions,
@ -428,15 +465,8 @@ class APIServerAdapter(BasePlatformAdapter):
else:
full_history.append({"role": "assistant", "content": final_response})
# Store response for future chaining
if store:
self._response_store.put(response_id, {
"input": raw_input,
"output": final_response,
"conversation_history": full_history,
"instructions": instructions,
"created_at": created_at,
})
# Build output items (includes tool calls + final message)
output_items = self._extract_output_items(result)
response_data = {
"id": response_id,
@ -444,42 +474,136 @@ class APIServerAdapter(BasePlatformAdapter):
"status": "completed",
"created_at": created_at,
"model": body.get("model", "hermes-agent"),
"output": [
{
"type": "message",
"role": "assistant",
"content": [
{
"type": "output_text",
"text": final_response,
}
],
}
],
"output": output_items,
"usage": {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
},
}
# Store the complete response object for future chaining / GET retrieval
if store:
self._response_store.put(response_id, {
"response": response_data,
"conversation_history": full_history,
"instructions": instructions,
})
return web.json_response(response_data)
# ------------------------------------------------------------------
# Agent execution
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# GET / DELETE response endpoints
# ------------------------------------------------------------------
async def _handle_get_response(self, request: "web.Request") -> "web.Response":
    """GET /v1/responses/{response_id} — retrieve a stored response."""
    denied = self._check_auth(request)
    if denied:
        return denied
    rid = request.match_info["response_id"]
    record = self._response_store.get(rid)
    if record is None:
        error_body = {
            "error": {
                "message": f"Response not found: {rid}",
                "type": "invalid_request_error",
            }
        }
        return web.json_response(error_body, status=404)
    # The full OpenAI-format response object was stored at creation time.
    return web.json_response(record["response"])
async def _handle_delete_response(self, request: "web.Request") -> "web.Response":
    """DELETE /v1/responses/{response_id} — delete a stored response."""
    denied = self._check_auth(request)
    if denied:
        return denied
    rid = request.match_info["response_id"]
    if not self._response_store.delete(rid):
        error_body = {
            "error": {
                "message": f"Response not found: {rid}",
                "type": "invalid_request_error",
            }
        }
        return web.json_response(error_body, status=404)
    # OpenAI-style deletion acknowledgement.
    return web.json_response({"id": rid, "object": "response", "deleted": True})
# ------------------------------------------------------------------
# Output extraction helper
# ------------------------------------------------------------------
@staticmethod
def _extract_output_items(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Build the full output item array from the agent's messages.

    Walks *result["messages"]* and emits:
    - ``function_call`` items for each tool_call on assistant messages
    - ``function_call_output`` items for each tool-role message
    - a final ``message`` item with the assistant's text reply
    """
    out: List[Dict[str, Any]] = []
    for message in result.get("messages", []):
        role = message.get("role")
        if role == "tool":
            out.append({
                "type": "function_call_output",
                "call_id": message.get("tool_call_id", ""),
                "output": message.get("content", ""),
            })
        elif role == "assistant" and message.get("tool_calls"):
            for call in message["tool_calls"]:
                fn = call.get("function", {})
                out.append({
                    "type": "function_call",
                    "name": fn.get("name", ""),
                    "arguments": fn.get("arguments", ""),
                    "call_id": call.get("id", ""),
                })
    # Close with the assistant's final text reply; fall back to the agent's
    # error string (or a placeholder) when no text was produced.
    text = result.get("final_response", "") or result.get("error", "(No response generated)")
    out.append({
        "type": "message",
        "role": "assistant",
        "content": [{"type": "output_text", "text": text}],
    })
    return out
# ------------------------------------------------------------------
# Agent execution
# ------------------------------------------------------------------
async def _run_agent(
self,
user_message: str,
conversation_history: List[Dict[str, str]],
ephemeral_system_prompt: Optional[str] = None,
session_id: Optional[str] = None,
) -> Dict[str, Any]:
) -> tuple:
"""
Create an agent and run a conversation in a thread executor.
run_conversation() is synchronous, so we run it off the event loop.
Returns ``(result_dict, usage_dict)`` where *usage_dict* contains
``input_tokens``, ``output_tokens`` and ``total_tokens``.
"""
loop = asyncio.get_event_loop()
@ -492,7 +616,12 @@ class APIServerAdapter(BasePlatformAdapter):
user_message=user_message,
conversation_history=conversation_history,
)
return result
usage = {
"input_tokens": getattr(agent, "session_prompt_tokens", 0) or 0,
"output_tokens": getattr(agent, "session_completion_tokens", 0) or 0,
"total_tokens": getattr(agent, "session_total_tokens", 0) or 0,
}
return result, usage
return await loop.run_in_executor(None, _run)
@ -507,11 +636,13 @@ class APIServerAdapter(BasePlatformAdapter):
return False
try:
self._app = web.Application()
self._app = web.Application(middlewares=[cors_middleware])
self._app.router.add_get("/health", self._handle_health)
self._app.router.add_get("/v1/models", self._handle_models)
self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
self._app.router.add_post("/v1/responses", self._handle_responses)
self._app.router.add_get("/v1/responses/{response_id}", self._handle_get_response)
self._app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response)
self._runner = web.AppRunner(self._app)
await self._runner.setup()