hermes-agent/tests/run_agent/test_openai_client_lifecycle.py
Kasun Athaudahetti 2d422720b5 fix(codex): size and propagate timeouts for Responses-API requests; lower stale defaults
Codex / Responses-API requests had three latent timeout bugs that combined
into the long silent hangs reported on #21444:

1. The non-stream stale-call detector estimated context tokens from
   ``api_kwargs["messages"]`` only. Codex / Responses-API payloads carry
   their conversational load in ``input`` (with ``instructions`` and
   ``tools``), so every Codex turn logged ``context=~0 tokens`` and the
   detector never applied its >50k / >100k tier bumps.

2. ``providers.<id>.request_timeout_seconds`` was silently dropped on the
   main Codex path. The chat_completions path and the auxiliary Codex
   adapter both forwarded it; the main path skipped it through three
   places (``build_api_kwargs``, ``ResponsesApiTransport.build_kwargs``,
   ``_preflight_codex_api_kwargs``).

3. The streaming stale detector had the same payload-shape bug for
   ``codex_responses`` requests, which route through the non-streaming
   detector (it's the path that emits the user-facing
   "No response from provider for 300s (non-streaming, ...)" warning that
   reporters keep pasting).

This commit:

- Adds ``estimate_request_context_tokens`` in ``chat_completion_helpers``,
  used by both the non-stream and stream detectors. Handles ``messages``
  (Chat Completions), ``input + instructions + tools`` (Responses API),
  bare lists, and an unknown-dict fallback.
- Forwards ``timeout`` through ``ResponsesApiTransport.build_kwargs``
  and ``_preflight_codex_api_kwargs`` (with guards against
  zero/negative/inf/bool values), and wires
  ``_resolved_api_call_timeout()`` into the Codex branch of
  ``build_api_kwargs``.
- Lowers the implicit non-stream stale defaults so fallback providers
  kick in faster when upstream stalls:
    * base   300s -> 90s
    * >50k   450s -> 150s
    * >100k  600s -> 240s
  These only apply when the user has *not* set
  ``providers.<id>.stale_timeout_seconds`` or
  ``HERMES_API_CALL_STALE_TIMEOUT``. Explicit config still wins.
- Adds regression tests for the estimator shapes, the new defaults, the
  context-tier scaling, transport timeout pass-through, and preflight
  timeout pass-through / rejection of invalid values.

Closes #21444
Supersedes #21652 #24126 #31855

Co-authored-by: Hoang V. Pham <26063003+hehehe0803@users.noreply.github.com>
2026-05-25 01:47:55 -07:00

209 lines
7.2 KiB
Python

import sys
import threading
import time
import types
from types import SimpleNamespace
import httpx
import pytest
from openai import APIConnectionError
sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
sys.modules.setdefault("fal_client", types.SimpleNamespace())
import run_agent
class FakeRequestClient:
def __init__(self, responder):
self._responder = responder
self._client = SimpleNamespace(is_closed=False)
self.chat = SimpleNamespace(
completions=SimpleNamespace(create=self._create)
)
self.responses = SimpleNamespace()
self.close_calls = 0
def _create(self, **kwargs):
return self._responder(**kwargs)
def close(self):
self.close_calls += 1
self._client.is_closed = True
class FakeSharedClient(FakeRequestClient):
pass
class OpenAIFactory:
def __init__(self, clients):
self._clients = list(clients)
self.calls = []
def __call__(self, **kwargs):
self.calls.append(dict(kwargs))
if not self._clients:
raise AssertionError("OpenAI factory exhausted")
return self._clients.pop(0)
def _build_agent(shared_client=None):
agent = run_agent.AIAgent.__new__(run_agent.AIAgent)
agent.api_mode = "chat_completions"
agent.provider = "openai-codex"
agent.base_url = "https://chatgpt.com/backend-api/codex"
agent.model = "gpt-5-codex"
agent.log_prefix = ""
agent.quiet_mode = True
agent._interrupt_requested = False
agent._interrupt_message = None
agent._client_lock = threading.RLock()
agent._client_kwargs = {"api_key": "***", "base_url": agent.base_url}
agent.client = shared_client or FakeSharedClient(lambda **kwargs: {"shared": True})
agent.stream_delta_callback = None
agent._stream_callback = None
agent.reasoning_callback = None
agent.status_callback = None
return agent
def _connection_error():
return APIConnectionError(
message="Connection error.",
request=httpx.Request("POST", "https://example.com/v1/chat/completions"),
)
def test_retry_after_api_connection_error_recreates_request_client(monkeypatch):
first_request = FakeRequestClient(lambda **kwargs: (_ for _ in ()).throw(_connection_error()))
second_request = FakeRequestClient(lambda **kwargs: {"ok": True})
factory = OpenAIFactory([first_request, second_request])
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent()
with pytest.raises(APIConnectionError):
agent._interruptible_api_call({"model": agent.model, "messages": []})
result = agent._interruptible_api_call({"model": agent.model, "messages": []})
assert result == {"ok": True}
assert len(factory.calls) == 2
assert first_request.close_calls >= 1
assert second_request.close_calls >= 1
def test_stale_non_stream_close_is_single_owner(monkeypatch):
def slow_responder(**kwargs):
time.sleep(0.1)
raise _connection_error()
request_client = FakeRequestClient(slow_responder)
factory = OpenAIFactory([request_client])
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent()
agent._compute_non_stream_stale_timeout = lambda api_payload: 0.01
with pytest.raises(APIConnectionError):
agent._interruptible_api_call({"model": agent.model, "messages": []})
assert request_client.close_calls == 1
def test_closed_shared_client_is_recreated_before_request(monkeypatch):
stale_shared = FakeSharedClient(lambda **kwargs: (_ for _ in ()).throw(AssertionError("stale shared client used")))
stale_shared._client.is_closed = True
replacement_shared = FakeSharedClient(lambda **kwargs: {"replacement": True})
request_client = FakeRequestClient(lambda **kwargs: {"ok": "fresh-request-client"})
factory = OpenAIFactory([replacement_shared, request_client])
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent(shared_client=stale_shared)
result = agent._interruptible_api_call({"model": agent.model, "messages": []})
assert result == {"ok": "fresh-request-client"}
assert agent.client is replacement_shared
assert stale_shared.close_calls >= 1
assert replacement_shared.close_calls == 0
assert len(factory.calls) == 2
def test_concurrent_requests_do_not_break_each_other_when_one_client_closes(monkeypatch):
first_started = threading.Event()
first_closed = threading.Event()
def first_responder(**kwargs):
first_started.set()
first_client.close()
first_closed.set()
raise _connection_error()
def second_responder(**kwargs):
assert first_started.wait(timeout=2)
assert first_closed.wait(timeout=2)
return {"ok": "second"}
first_client = FakeRequestClient(first_responder)
second_client = FakeRequestClient(second_responder)
factory = OpenAIFactory([first_client, second_client])
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent()
results = {}
def run_call(name):
try:
results[name] = agent._interruptible_api_call({"model": agent.model, "messages": []})
except Exception as exc: # noqa: BLE001 - asserting exact type below
results[name] = exc
thread_one = threading.Thread(target=run_call, args=("first",), daemon=True)
thread_two = threading.Thread(target=run_call, args=("second",), daemon=True)
thread_one.start()
thread_two.start()
thread_one.join(timeout=5)
thread_two.join(timeout=5)
values = list(results.values())
assert sum(isinstance(value, APIConnectionError) for value in values) == 1
assert values.count({"ok": "second"}) == 1
assert len(factory.calls) == 2
def test_streaming_call_recreates_closed_shared_client_before_request(monkeypatch):
chunks = iter([
SimpleNamespace(
model="gpt-5-codex",
choices=[SimpleNamespace(delta=SimpleNamespace(content="Hello", tool_calls=None), finish_reason=None)],
),
SimpleNamespace(
model="gpt-5-codex",
choices=[SimpleNamespace(delta=SimpleNamespace(content=" world", tool_calls=None), finish_reason="stop")],
),
])
stale_shared = FakeSharedClient(lambda **kwargs: (_ for _ in ()).throw(AssertionError("stale shared client used")))
stale_shared._client.is_closed = True
replacement_shared = FakeSharedClient(lambda **kwargs: {"replacement": True})
request_client = FakeRequestClient(lambda **kwargs: chunks)
factory = OpenAIFactory([replacement_shared, request_client])
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent(shared_client=stale_shared)
agent.stream_delta_callback = lambda _delta: None
# Force chat_completions mode so the streaming path uses
# chat.completions.create(stream=True) instead of Codex responses.stream()
agent.api_mode = "chat_completions"
response = agent._interruptible_streaming_api_call({"model": agent.model, "messages": []})
assert response.choices[0].message.content == "Hello world"
assert agent.client is replacement_shared
assert stale_shared.close_calls >= 1
assert request_client.close_calls >= 1
assert len(factory.calls) == 2