mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
Codex / Responses-API requests had three latent timeout bugs that combined into the long silent hangs reported on #21444:

1. The non-stream stale-call detector estimated context tokens from ``api_kwargs["messages"]`` only. Codex / Responses-API payloads carry their conversational load in ``input`` (with ``instructions`` and ``tools``), so every Codex turn logged ``context=~0 tokens`` and the detector never applied its >50k / >100k tier bumps.
2. ``providers.<id>.request_timeout_seconds`` was silently dropped on the main Codex path. The chat_completions path and the auxiliary Codex adapter both forwarded it; the main path skipped it through three places (``build_api_kwargs``, ``ResponsesApiTransport.build_kwargs``, ``_preflight_codex_api_kwargs``).
3. The streaming stale detector had the same payload-shape bug for ``codex_responses`` requests, which route through the non-streaming detector (it's the path that emits the user-facing "No response from provider for 300s (non-streaming, ...)" warning that reporters keep pasting).

This commit:

- Adds ``estimate_request_context_tokens`` in ``chat_completion_helpers``, used by both the non-stream and stream detectors. Handles ``messages`` (Chat Completions), ``input + instructions + tools`` (Responses API), bare lists, and an unknown-dict fallback.
- Forwards ``timeout`` through ``ResponsesApiTransport.build_kwargs`` and ``_preflight_codex_api_kwargs`` (with guards against zero/negative/inf/bool values), and wires ``_resolved_api_call_timeout()`` into the Codex branch of ``build_api_kwargs``.
- Lowers the implicit non-stream stale defaults so fallback providers kick in faster when upstream stalls:
  * base 300s -> 90s
  * >50k 450s -> 150s
  * >100k 600s -> 240s
  These only apply when the user has *not* set ``providers.<id>.stale_timeout_seconds`` or ``HERMES_API_CALL_STALE_TIMEOUT``. Explicit config still wins.
- Adds regression tests for the estimator shapes, the new defaults, the context-tier scaling, transport timeout pass-through, and preflight timeout pass-through / rejection of invalid values.

Closes #21444
Supersedes #21652 #24126 #31855

Co-authored-by: Hoang V. Pham <26063003+hehehe0803@users.noreply.github.com>
209 lines
7.2 KiB
Python
209 lines
7.2 KiB
Python
import sys
|
|
import threading
|
|
import time
|
|
import types
|
|
from types import SimpleNamespace
|
|
|
|
import httpx
|
|
import pytest
|
|
from openai import APIConnectionError
|
|
|
|
# Register lightweight placeholder modules under the names ``fire``,
# ``firecrawl`` and ``fal_client``. ``setdefault`` only inserts a placeholder
# when no module of that name is already registered, so a real, previously
# imported module is left untouched.
# NOTE(review): presumably these are optional dependencies pulled in when
# ``run_agent`` is imported - confirm against that module's imports.
sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
sys.modules.setdefault("fal_client", types.SimpleNamespace())
|
|
|
|
import run_agent
|
|
|
|
|
|
class FakeRequestClient:
    """Minimal stand-in for an OpenAI-style client object.

    Only the surface exercised by the tests in this module is provided:
    ``chat.completions.create``, an empty ``responses`` namespace, a
    ``_client.is_closed`` flag and ``close()``.

    Args:
        responder: Callable invoked with the keyword arguments given to
            ``chat.completions.create``. Whatever it returns (or raises)
            becomes the outcome of the call.
    """

    def __init__(self, responder):
        self._responder = responder
        # Closed-state flag; tests flip it by hand to simulate a client
        # that was already closed, and close() sets it.
        self._client = SimpleNamespace(is_closed=False)
        self.chat = SimpleNamespace(
            completions=SimpleNamespace(create=self._create)
        )
        self.responses = SimpleNamespace()
        # Counts close() invocations so tests can assert how often the
        # client was torn down.
        self.close_calls = 0

    def _create(self, **kwargs):
        """Forward the request keyword arguments to the injected responder."""
        return self._responder(**kwargs)

    def close(self):
        """Record the call and mark the client as closed."""
        self.close_calls += 1
        self._client.is_closed = True
|
|
|
|
|
|
class FakeSharedClient(FakeRequestClient):
    """Fake client installed as ``agent.client`` in these tests.

    Behaves exactly like ``FakeRequestClient``; the separate type only
    makes the role of an instance obvious at the construction site.
    """
|
|
|
|
|
|
class OpenAIFactory:
    """Callable replacement for the ``OpenAI`` constructor.

    Hands out pre-built clients in the order supplied and records the
    keyword arguments of every construction attempt.

    Args:
        clients: Iterable of client objects to return, one per call.
    """

    def __init__(self, clients):
        # Copy so the caller's sequence is not consumed by pop().
        self._clients = list(clients)
        # One dict of constructor kwargs per call, in call order.
        self.calls = []

    def __call__(self, **kwargs):
        """Return the next queued client.

        The call is recorded before the queue is checked, so an
        exhausted-factory call still appears in ``calls``.

        Raises:
            AssertionError: If more clients are requested than were queued.
        """
        self.calls.append(dict(kwargs))
        if not self._clients:
            raise AssertionError("OpenAI factory exhausted")
        return self._clients.pop(0)
|
|
|
|
|
|
def _build_agent(shared_client=None):
    """Create an ``AIAgent`` for tests without running its ``__init__``.

    The instance is allocated with ``__new__`` and then given the
    attributes set below by hand.
    NOTE(review): presumably this is the minimal state the exercised call
    paths read - confirm against ``run_agent.AIAgent``.

    Args:
        shared_client: Object to install as ``agent.client``. When ``None``
            a ``FakeSharedClient`` whose responder returns
            ``{"shared": True}`` is used.

    Returns:
        The partially initialised agent.
    """
    # Explicit None check: a supplied client must be used even if it
    # happens to evaluate as falsy.
    if shared_client is None:
        shared_client = FakeSharedClient(lambda **kwargs: {"shared": True})

    agent = run_agent.AIAgent.__new__(run_agent.AIAgent)

    # Provider / routing configuration.
    agent.api_mode = "chat_completions"
    agent.provider = "openai-codex"
    agent.base_url = "https://chatgpt.com/backend-api/codex"
    agent.model = "gpt-5-codex"

    # Logging and interrupt state.
    agent.log_prefix = ""
    agent.quiet_mode = True
    agent._interrupt_requested = False
    agent._interrupt_message = None

    # Client management.
    agent._client_lock = threading.RLock()
    agent._client_kwargs = {"api_key": "***", "base_url": agent.base_url}
    agent.client = shared_client

    # No callbacks by default; individual tests override as needed.
    agent.stream_delta_callback = None
    agent._stream_callback = None
    agent.reasoning_callback = None
    agent.status_callback = None
    return agent
|
|
|
|
|
|
def _connection_error():
    """Return a fresh ``APIConnectionError`` bound to a dummy POST request.

    A new exception instance is built on every call.
    """
    return APIConnectionError(
        message="Connection error.",
        request=httpx.Request("POST", "https://example.com/v1/chat/completions"),
    )
|
|
|
|
|
|
def test_retry_after_api_connection_error_recreates_request_client(monkeypatch):
    """A call made after a connection error is served by a new client.

    The first queued client raises ``APIConnectionError``; the second
    returns a payload. The test checks that the factory was invoked once
    per call and that both request clients were closed at least once.
    """

    def failing_responder(**kwargs):
        raise _connection_error()

    first_request = FakeRequestClient(failing_responder)
    second_request = FakeRequestClient(lambda **kwargs: {"ok": True})
    factory = OpenAIFactory([first_request, second_request])
    monkeypatch.setattr(run_agent, "OpenAI", factory)

    agent = _build_agent()

    with pytest.raises(APIConnectionError):
        agent._interruptible_api_call({"model": agent.model, "messages": []})

    result = agent._interruptible_api_call({"model": agent.model, "messages": []})

    assert result == {"ok": True}
    assert len(factory.calls) == 2
    assert first_request.close_calls >= 1
    assert second_request.close_calls >= 1
|
|
|
|
|
|
def test_stale_non_stream_close_is_single_owner(monkeypatch):
    """The request client is closed exactly once when a call stalls and fails.

    The responder sleeps for 0.1s, longer than the 0.01s stale timeout
    forced on the agent, and then raises ``APIConnectionError``.
    NOTE(review): presumably both the stale detector and the error path
    could close the client; the test pins that only one of them does.
    """

    def slow_responder(**kwargs):
        time.sleep(0.1)
        raise _connection_error()

    request_client = FakeRequestClient(slow_responder)
    factory = OpenAIFactory([request_client])
    monkeypatch.setattr(run_agent, "OpenAI", factory)

    agent = _build_agent()
    # Force a stale timeout well below the responder's sleep.
    agent._compute_non_stream_stale_timeout = lambda api_payload: 0.01

    with pytest.raises(APIConnectionError):
        agent._interruptible_api_call({"model": agent.model, "messages": []})

    assert request_client.close_calls == 1
|
|
|
|
|
|
def test_closed_shared_client_is_recreated_before_request(monkeypatch):
    """An already-closed shared client is replaced before a request is sent.

    The agent starts with a shared client flagged as closed, whose
    responder fails the test if it is ever invoked. The factory then hands
    out a replacement shared client followed by a per-request client.
    """

    def stale_responder(**kwargs):
        raise AssertionError("stale shared client used")

    stale_shared = FakeSharedClient(stale_responder)
    stale_shared._client.is_closed = True

    replacement_shared = FakeSharedClient(lambda **kwargs: {"replacement": True})
    request_client = FakeRequestClient(lambda **kwargs: {"ok": "fresh-request-client"})
    factory = OpenAIFactory([replacement_shared, request_client])
    monkeypatch.setattr(run_agent, "OpenAI", factory)

    agent = _build_agent(shared_client=stale_shared)
    result = agent._interruptible_api_call({"model": agent.model, "messages": []})

    # The response came from the per-request client, not a shared one.
    assert result == {"ok": "fresh-request-client"}
    assert agent.client is replacement_shared
    assert stale_shared.close_calls >= 1
    # The replacement shared client must stay open.
    assert replacement_shared.close_calls == 0
    assert len(factory.calls) == 2
|
|
|
|
|
|
def test_concurrent_requests_do_not_break_each_other_when_one_client_closes(monkeypatch):
    """Closing one request's client does not break a concurrent request.

    Two threads call the agent at the same time. The first queued client
    closes itself and raises ``APIConnectionError``; the second client's
    responder waits until that has happened and then returns normally.
    Exactly one call must fail and exactly one must succeed, whichever
    thread happens to receive which client.
    """
    first_started = threading.Event()
    first_closed = threading.Event()

    def first_responder(**kwargs):
        first_started.set()
        # Resolved at call time; first_client is bound further down.
        first_client.close()
        first_closed.set()
        raise _connection_error()

    def second_responder(**kwargs):
        # Answer only after the other client has been closed.
        assert first_started.wait(timeout=2)
        assert first_closed.wait(timeout=2)
        return {"ok": "second"}

    first_client = FakeRequestClient(first_responder)
    second_client = FakeRequestClient(second_responder)
    factory = OpenAIFactory([first_client, second_client])
    monkeypatch.setattr(run_agent, "OpenAI", factory)

    agent = _build_agent()
    results = {}

    def run_call(name):
        try:
            results[name] = agent._interruptible_api_call({"model": agent.model, "messages": []})
        except Exception as exc:  # noqa: BLE001 - asserting exact type below
            results[name] = exc

    thread_one = threading.Thread(target=run_call, args=("first",), daemon=True)
    thread_two = threading.Thread(target=run_call, args=("second",), daemon=True)
    thread_one.start()
    thread_two.start()
    thread_one.join(timeout=5)
    thread_two.join(timeout=5)

    # join(timeout=...) returns silently on timeout, so report a hung
    # worker directly instead of through a confusing result-count mismatch.
    assert not thread_one.is_alive(), "first worker did not finish within 5s"
    assert not thread_two.is_alive(), "second worker did not finish within 5s"

    values = list(results.values())
    assert sum(isinstance(value, APIConnectionError) for value in values) == 1
    assert values.count({"ok": "second"}) == 1
    assert len(factory.calls) == 2
|
|
|
|
|
|
|
|
def test_streaming_call_recreates_closed_shared_client_before_request(monkeypatch):
    """The streaming path also replaces a closed shared client first.

    The per-request client returns an iterator of two chunks whose delta
    contents are "Hello" and " world". The test checks the assembled
    message content, that the shared client was replaced, and that the
    per-request client was closed.
    """

    def make_chunk(text, finish_reason):
        return SimpleNamespace(
            model="gpt-5-codex",
            choices=[
                SimpleNamespace(
                    delta=SimpleNamespace(content=text, tool_calls=None),
                    finish_reason=finish_reason,
                )
            ],
        )

    chunks = iter([make_chunk("Hello", None), make_chunk(" world", "stop")])

    def stale_responder(**kwargs):
        raise AssertionError("stale shared client used")

    stale_shared = FakeSharedClient(stale_responder)
    stale_shared._client.is_closed = True

    replacement_shared = FakeSharedClient(lambda **kwargs: {"replacement": True})
    request_client = FakeRequestClient(lambda **kwargs: chunks)
    factory = OpenAIFactory([replacement_shared, request_client])
    monkeypatch.setattr(run_agent, "OpenAI", factory)

    agent = _build_agent(shared_client=stale_shared)
    agent.stream_delta_callback = lambda _delta: None
    # Force chat_completions mode so the streaming path uses
    # chat.completions.create(stream=True) instead of Codex responses.stream()
    agent.api_mode = "chat_completions"
    response = agent._interruptible_streaming_api_call({"model": agent.model, "messages": []})

    assert response.choices[0].message.content == "Hello world"
    assert agent.client is replacement_shared
    assert stale_shared.close_calls >= 1
    assert request_client.close_calls >= 1
    assert len(factory.calls) == 2
|