feat: add streaming token output and simplify CLI to plain stdout

This commit is contained in:
Brooklyn Nicholson 2026-03-09 17:33:28 -05:00
parent c754135965
commit 4d6c90c6d0
7 changed files with 599 additions and 982 deletions

View file

@ -174,6 +174,7 @@ class AIAgent:
tool_progress_callback: callable = None,
clarify_callback: callable = None,
step_callback: callable = None,
stream_delta_callback: callable = None,
max_tokens: int = None,
reasoning_config: Dict[str, Any] = None,
prefill_messages: List[Dict[str, Any]] = None,
@ -258,6 +259,7 @@ class AIAgent:
self.tool_progress_callback = tool_progress_callback
self.clarify_callback = clarify_callback
self.step_callback = step_callback
self.stream_delta_callback = stream_delta_callback
self._last_reported_tool = None # Track for "new tool" mode
# Interrupt mechanism for breaking out of tool loops
@ -2158,6 +2160,137 @@ class AIAgent:
raise result["error"]
return result["response"]
def _interruptible_streaming_api_call(self, api_kwargs: dict, on_first_delta=None):
"""Streaming variant of _interruptible_api_call for chat_completions.
Fires self.stream_delta_callback(text) as content tokens arrive and
accumulates the full response into a SimpleNamespace matching the shape
downstream code expects. Falls back to the non-streaming path when the
provider rejects the stream request.
"""
from types import SimpleNamespace
result = {"response": None, "error": None}
first_delta_fired = [False]
def _stream():
try:
stream_kwargs = {**api_kwargs, "stream": True,
"stream_options": {"include_usage": True}}
stream = self.client.chat.completions.create(**stream_kwargs)
content_parts = []
tool_calls_acc = {}
finish_reason = "stop"
usage = None
reasoning_content = None
model = None
for chunk in stream:
if not chunk.choices:
if hasattr(chunk, "usage") and chunk.usage:
usage = chunk.usage
continue
choice = chunk.choices[0]
if choice.finish_reason:
finish_reason = choice.finish_reason
if model is None and hasattr(chunk, "model"):
model = chunk.model
delta = choice.delta
if delta is None:
continue
if delta.content:
content_parts.append(delta.content)
if not first_delta_fired[0]:
first_delta_fired[0] = True
if on_first_delta:
on_first_delta()
if self.stream_delta_callback:
try:
self.stream_delta_callback(delta.content)
except Exception:
pass
if delta.tool_calls:
for tc_delta in delta.tool_calls:
idx = tc_delta.index
if idx not in tool_calls_acc:
tool_calls_acc[idx] = {
"id": tc_delta.id or "",
"type": tc_delta.type or "function",
"function": {
"name": getattr(tc_delta.function, "name", None) or "",
"arguments": getattr(tc_delta.function, "arguments", None) or "",
},
}
else:
entry = tool_calls_acc[idx]
if tc_delta.id:
entry["id"] = tc_delta.id
fn = tc_delta.function
if fn:
if fn.name:
entry["function"]["name"] = fn.name
if fn.arguments:
entry["function"]["arguments"] += fn.arguments
rc = getattr(delta, "reasoning_content", None) or getattr(delta, "reasoning", None)
if rc:
reasoning_content = (reasoning_content or "") + rc
tool_calls_list = None
if tool_calls_acc:
tool_calls_list = [
SimpleNamespace(
id=tc["id"], call_id=tc["id"], type=tc["type"],
function=SimpleNamespace(name=tc["function"]["name"],
arguments=tc["function"]["arguments"]),
)
for idx, tc in sorted(tool_calls_acc.items())
]
message = SimpleNamespace(
content="".join(content_parts) or None,
tool_calls=tool_calls_list,
reasoning=reasoning_content,
reasoning_content=reasoning_content,
reasoning_details=None,
)
result["response"] = SimpleNamespace(
choices=[SimpleNamespace(message=message, finish_reason=finish_reason)],
usage=usage,
model=model,
)
except Exception as e:
result["error"] = e
t = threading.Thread(target=_stream, daemon=True)
t.start()
while t.is_alive():
t.join(timeout=0.3)
if self._interrupt_requested:
try:
self.client.close()
except Exception:
pass
try:
self.client = OpenAI(**self._client_kwargs)
except Exception:
pass
raise InterruptedError("Agent interrupted during streaming API call")
if result["error"] is not None:
err = result["error"]
err_str = str(err).lower()
if any(kw in err_str for kw in ("stream", "not support", "unsupported")):
logger.debug("Streaming failed (%s), falling back to non-streaming.", err)
return self._interruptible_api_call(api_kwargs)
raise err
return result["response"]
# ── Provider fallback ──────────────────────────────────────────────────
# API-key providers: provider → (base_url, [env_var_names])
@ -3353,12 +3486,24 @@ class AIAgent:
if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
self._dump_api_request_debug(api_kwargs, reason="preflight")
response = self._interruptible_api_call(api_kwargs)
if self.stream_delta_callback and self.api_mode != "codex_responses":
def _stop_spinner():
nonlocal thinking_spinner
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
response = self._interruptible_streaming_api_call(
api_kwargs, on_first_delta=_stop_spinner)
# Newline after streamed content so tool lines don't overwrite it
if response and hasattr(response, 'choices') and response.choices:
msg = response.choices[0].message
if msg and msg.content and msg.tool_calls:
print(flush=True)
else:
response = self._interruptible_api_call(api_kwargs)
api_duration = time.time() - api_start_time
# Stop thinking spinner silently -- the response box or tool
# execution messages that follow are more informative.
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
@ -4055,8 +4200,8 @@ class AIAgent:
turn_content = assistant_message.content or ""
if turn_content and self._has_content_after_think_block(turn_content):
self._last_content_with_tools = turn_content
# Show intermediate commentary so the user can follow along
if self.quiet_mode:
# Show intermediate commentary — skip when streaming (already in buffer)
if self.quiet_mode and not self.stream_delta_callback:
clean = self._strip_think_blocks(turn_content).strip()
if clean:
print(f" ┊ 💬 {clean}")