From 83f6a83b2482168aadadf5fc687ee82f42b52b82 Mon Sep 17 00:00:00 2001 From: Fewmanism <170640479+Fewmanism@users.noreply.github.com> Date: Fri, 15 May 2026 04:28:44 +0900 Subject: [PATCH] fix(tui): handle images with codex app-server --- agent/transports/codex_app_server_session.py | 39 ++++++++++++++++++- .../test_codex_app_server_session.py | 39 +++++++++++++++++++ tui_gateway/server.py | 2 + 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/agent/transports/codex_app_server_session.py b/agent/transports/codex_app_server_session.py index d9ee92dfbf5..74e164d64d9 100644 --- a/agent/transports/codex_app_server_session.py +++ b/agent/transports/codex_app_server_session.py @@ -87,6 +87,39 @@ class TurnResult: _TURN_ABORTED_MARKERS = ("", "") +def _coerce_turn_input_text(user_input: Any) -> str: + """Collapse Hermes/OpenAI rich content into app-server text input. + + The current `turn/start` path sends text items only. TUI image attachment + can hand us OpenAI-style content parts, so keep the text/path hints and + replace opaque image payloads with a small marker instead of putting a + Python list into the `text` field. + """ + if isinstance(user_input, str): + return user_input + if isinstance(user_input, list): + parts: list[str] = [] + for item in user_input: + if isinstance(item, str): + if item.strip(): + parts.append(item) + continue + if not isinstance(item, dict): + if item is not None: + parts.append(str(item)) + continue + item_type = item.get("type") + if item_type in {"text", "input_text"}: + text = item.get("text") or item.get("content") or "" + if text: + parts.append(str(text)) + elif item_type in {"image", "image_url", "input_image"}: + parts.append("[image attached]") + text = "\n\n".join(p for p in parts if p).strip() + return text or "What do you see in this image?" + return "" if user_input is None else str(user_input) + + # Substrings in codex stderr / JSON-RPC error messages that signal the # subprocess died because its OAuth credentials are no longer valid. # Kept conservative: we only redirect users to `codex login` when we're @@ -327,7 +360,7 @@ class CodexAppServerSession: def run_turn( self, - user_input: str, + user_input: Any, *, turn_timeout: float = 600.0, notification_poll_timeout: float = 0.25, @@ -365,6 +398,8 @@ class CodexAppServerSession: self._interrupt_event.clear() projector = CodexEventProjector() + user_input_text = _coerce_turn_input_text(user_input) + # Send turn/start with the user input. Text-only for now (codex # supports rich content but Hermes' text path is the common case). try: @@ -372,7 +407,7 @@ class CodexAppServerSession: "turn/start", { "threadId": self._thread_id, - "input": [{"type": "text", "text": user_input}], + "input": [{"type": "text", "text": user_input_text}], }, timeout=10, ) diff --git a/tests/agent/transports/test_codex_app_server_session.py b/tests/agent/transports/test_codex_app_server_session.py index b192d64e1c8..d43a92a1eb9 100644 --- a/tests/agent/transports/test_codex_app_server_session.py +++ b/tests/agent/transports/test_codex_app_server_session.py @@ -20,6 +20,7 @@ from agent.transports.codex_app_server_session import ( TurnResult, _ServerRequestRouting, _approval_choice_to_codex_decision, + _coerce_turn_input_text, ) @@ -128,6 +129,15 @@ class TestApprovalChoiceMapping: assert _approval_choice_to_codex_decision(choice) == expected +class TestTurnInputCoercion: + def test_list_content_keeps_text_and_marks_images(self): + text = _coerce_turn_input_text([ + {"type": "text", "text": "caption"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}, + ]) + assert text == "caption\n\n[image attached]" + + # ---- lifecycle ---- class TestLifecycle: @@ -188,6 +198,35 @@ class TestRunTurn: # turn_id propagated for downstream session-DB linkage assert r.turn_id == "turn-fake-001" + def test_rich_content_turn_is_collapsed_to_text_payload(self): + client = FakeClient() + client.queue_notification( + "turn/completed", + threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) + r = s.run_turn( + [ + { + "type": "text", + "text": "look at this\n\n[Image attached at: /tmp/a.png]", + }, + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,abc"}, + }, + ], + turn_timeout=2.0, + ) + assert r.error is None + method, params = next(req for req in client.requests if req[0] == "turn/start") + assert method == "turn/start" + text = params["input"][0]["text"] + assert isinstance(text, str) + assert "[Image attached at: /tmp/a.png]" in text + assert "[image attached]" in text + def test_tool_iteration_counter_ticks(self): client = FakeClient() # Two completed exec items + one final agent message diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 35b13a65914..82107b7c05b 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -3350,6 +3350,8 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None: _read_main_model(), _cfg, ) + if getattr(agent, "api_mode", "") == "codex_app_server": + _mode = "text" except Exception as _img_exc: print( f"[tui_gateway] image_routing decision failed, defaulting to text: {_img_exc}",