@dataclass(eq=False)
class CodexAppServerError(RuntimeError):
    """Raised when the app-server answers a request with a JSON-RPC error.

    The fields mirror the JSON-RPC error object: ``code`` and ``message`` are
    always present, ``data`` is the optional server-supplied payload.

    ``eq=False`` keeps the identity-based ``__eq__``/``__hash__`` inherited
    from ``BaseException``; a default dataclass would set ``__hash__`` to
    ``None`` and make the exception unhashable.
    """

    code: int
    message: str
    data: Optional[Any] = None

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # keyword construction would leave ``args`` empty (which also breaks
        # pickling/copying of the exception). Populate it explicitly.
        super().__init__(self.code, self.message, self.data)

    def __str__(self) -> str:  # pragma: no cover - trivial
        return f"codex app-server error {self.code}: {self.message}"
def __init__(
    self,
    codex_bin: str = "codex",
    codex_home: Optional[str] = None,
    extra_args: Optional[list[str]] = None,
    env: Optional[dict[str, str]] = None,
) -> None:
    """Spawn ``<codex_bin> app-server`` and start the two pipe-reader threads.

    Args:
        codex_bin: Executable to launch.
        codex_home: When truthy, exported to the child as ``CODEX_HOME``.
        extra_args: Extra argv entries appended after ``app-server``.
        env: Variables layered on top of a copy of ``os.environ``.
    """
    self._codex_bin = codex_bin
    argv = [codex_bin, "app-server", *(extra_args or [])]

    # Child environment: inherited vars, then caller overrides, then
    # CODEX_HOME. RUST_LOG is only defaulted, never overridden.
    child_env = {**os.environ, **(env or {})}
    if codex_home:
        child_env["CODEX_HOME"] = codex_home
    # Codex emits tracing to stderr; default WARN keeps it quiet for users.
    child_env.setdefault("RUST_LOG", "warn")

    self._proc = subprocess.Popen(
        argv,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=0,
        env=child_env,
    )

    # Request bookkeeping.
    self._next_id = 1
    self._pending: dict[int, _Pending] = {}
    self._pending_lock = threading.Lock()

    # Inbound traffic that is not a reply to one of our requests.
    self._notifications: queue.Queue = queue.Queue()
    self._server_requests: queue.Queue = queue.Queue()

    # Captured stderr for diagnostics.
    self._stderr_lines: list[str] = []
    self._stderr_lock = threading.Lock()

    self._closed = False
    self._initialized = False

    # All state above must exist before the readers start touching it.
    self._reader = threading.Thread(target=self._read_stdout, daemon=True)
    self._reader.start()
    self._stderr_reader = threading.Thread(target=self._read_stderr, daemon=True)
    self._stderr_reader.start()
def request(
    self,
    method: str,
    params: Optional[dict] = None,
    timeout: float = 30.0,
) -> dict:
    """Send a JSON-RPC request and block until its reply arrives.

    Args:
        method: JSON-RPC method name.
        params: Request params; ``None`` is sent as an empty object.
        timeout: Seconds to wait for the reply.

    Returns:
        The reply's ``result``; an absent or ``null`` result yields ``{}``.

    Raises:
        TimeoutError: no reply arrived within ``timeout`` seconds.
        CodexAppServerError: the reply carried an ``error`` object.
        RuntimeError: the request could not be written (propagated from
            ``_send``).
    """
    rid = self._take_id()
    reply_box: queue.Queue = queue.Queue(maxsize=1)
    with self._pending_lock:
        self._pending[rid] = _Pending(queue=reply_box, method=method)
    try:
        self._send({"id": rid, "method": method, "params": params or {}})
        msg = reply_box.get(timeout=timeout)
    except queue.Empty:
        raise TimeoutError(
            f"codex app-server method {method!r} timed out after {timeout}s"
        ) from None
    finally:
        # Always unregister: previously a failed _send left the entry behind
        # forever. On success the reader thread has already popped it, so
        # this is a no-op.
        with self._pending_lock:
            self._pending.pop(rid, None)
    if "error" in msg:
        err = msg["error"]
        raise CodexAppServerError(
            code=err.get("code", -1),
            message=err.get("message", ""),
            data=err.get("data"),
        )
    result = msg.get("result")
    # A JSON `null` result must not leak out as None: callers index into the
    # returned dict.
    return {} if result is None else result
exec/applyPatch approval).""" + try: + if timeout <= 0: + return self._server_requests.get_nowait() + return self._server_requests.get(timeout=timeout) + except queue.Empty: + return None + + # ---------- diagnostics ---------- + + def stderr_tail(self, n: int = 20) -> list[str]: + """Return last n lines of codex's stderr (for error reports).""" + with self._stderr_lock: + return list(self._stderr_lines[-n:]) + + def is_alive(self) -> bool: + return self._proc.poll() is None + + # ---------- internals ---------- + + def _take_id(self) -> int: + # JSON-RPC ids only need to be unique per-connection. A simple + # monotonically increasing int is the common choice and matches what + # codex's own clients use. + rid = self._next_id + self._next_id += 1 + return rid + + def _send(self, obj: dict) -> None: + if self._closed: + raise RuntimeError("codex app-server client is closed") + if self._proc.stdin is None: + raise RuntimeError("codex app-server stdin not available") + try: + self._proc.stdin.write((json.dumps(obj) + "\n").encode("utf-8")) + self._proc.stdin.flush() + except (BrokenPipeError, ValueError) as exc: + raise RuntimeError( + f"codex app-server stdin closed unexpectedly: {exc}" + ) from exc + + def _read_stdout(self) -> None: + if self._proc.stdout is None: + return + try: + for line in iter(self._proc.stdout.readline, b""): + if not line: + break + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + except json.JSONDecodeError: + # Non-JSON output is unexpected on stdout; tracing belongs + # on stderr. Surface it via stderr buffer for diagnostics. 
+ with self._stderr_lock: + self._stderr_lines.append( + f" {line[:200]!r}" + ) + continue + self._dispatch(msg) + except Exception as exc: + with self._stderr_lock: + self._stderr_lines.append(f" {exc}") + + def _dispatch(self, msg: dict) -> None: + # Reply (has id + result/error, no method) + if "id" in msg and ("result" in msg or "error" in msg): + with self._pending_lock: + pending = self._pending.pop(msg["id"], None) + if pending is not None: + try: + pending.queue.put_nowait(msg) + except queue.Full: # pragma: no cover - defensive + pass + return + # Server-initiated request (has id + method) + if "id" in msg and "method" in msg: + self._server_requests.put(msg) + return + # Notification (no id) + if "method" in msg: + self._notifications.put(msg) + + def _read_stderr(self) -> None: + if self._proc.stderr is None: + return + try: + for line in iter(self._proc.stderr.readline, b""): + if not line: + break + with self._stderr_lock: + self._stderr_lines.append( + line.decode("utf-8", "replace").rstrip() + ) + # Bound memory: keep last 500 lines. + if len(self._stderr_lines) > 500: + self._stderr_lines = self._stderr_lines[-500:] + except Exception: # pragma: no cover + pass + + +def parse_codex_version(output: str) -> Optional[tuple[int, int, int]]: + """Parse `codex --version` output. Returns (major, minor, patch) or None.""" + # Output format: "codex-cli 0.130.0" possibly followed by metadata. + import re + + match = re.search(r"(\d+)\.(\d+)\.(\d+)", output or "") + if not match: + return None + return (int(match.group(1)), int(match.group(2)), int(match.group(3))) + + +def check_codex_binary( + codex_bin: str = "codex", min_version: tuple[int, int, int] = MIN_CODEX_VERSION +) -> tuple[bool, str]: + """Verify codex CLI is installed and meets minimum version. + + Returns (ok, message). 
Used by setup wizard and runtime startup.""" + try: + proc = subprocess.run( + [codex_bin, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + except FileNotFoundError: + return False, ( + f"codex CLI not found at {codex_bin!r}. Install with: " + f"npm i -g @openai/codex" + ) + except subprocess.TimeoutExpired: + return False, "codex --version timed out" + if proc.returncode != 0: + return False, f"codex --version exited {proc.returncode}: {proc.stderr.strip()}" + version = parse_codex_version(proc.stdout) + if version is None: + return False, f"could not parse codex version from: {proc.stdout!r}" + if version < min_version: + return False, ( + f"codex {'.'.join(map(str, version))} is older than required " + f"{'.'.join(map(str, min_version))}. Run: npm i -g @openai/codex" + ) + return True, ".".join(map(str, version)) diff --git a/agent/transports/codex_app_server_session.py b/agent/transports/codex_app_server_session.py new file mode 100644 index 00000000000..619cfeabfc1 --- /dev/null +++ b/agent/transports/codex_app_server_session.py @@ -0,0 +1,525 @@ +"""Session adapter for codex app-server runtime. + +Owns one Codex thread per Hermes session. Drives `turn/start`, consumes +streaming notifications via CodexEventProjector, handles server-initiated +approval requests (apply_patch, exec command), translates cancellation, +and returns a clean turn result that AIAgent.run_conversation() can splice +into its `messages` list. 
+ +Lifecycle: + session = CodexAppServerSession(cwd="/home/x/proj") + session.ensure_started() # spawns + handshake + thread/start + result = session.run_turn(user_input="hello") # blocks until turn/completed + # result.final_text → assistant text returned to caller + # result.projected_messages → list of {role, content, ...} for messages list + # result.tool_iterations → how many tool-shaped items completed (skill nudge counter) + # result.interrupted → True if Ctrl+C / interrupt_requested fired mid-turn + session.close() # tears down subprocess + +Threading model: the adapter is single-threaded from the caller's perspective. +The underlying CodexAppServerClient owns its own reader threads but exposes +blocking-with-timeout queues that this adapter polls in a loop, so the run_turn +call is synchronous and behaves like AIAgent's existing chat_completions loop. +""" + +from __future__ import annotations + +import logging +import os +import threading +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Optional + +from agent.transports.codex_app_server import ( + CodexAppServerClient, + CodexAppServerError, +) +from agent.transports.codex_event_projector import CodexEventProjector + +logger = logging.getLogger(__name__) + + +# Permission profile mapping mirrors the docstring in PR proposal: +# Hermes' tools.terminal.security_mode → Codex's permissions profile id. +# Defaults if config is missing → workspace-write (matches Codex's own default). +_HERMES_TO_CODEX_PERMISSION_PROFILE = { + "auto": "workspace-write", + "approval-required": "read-only-with-approval", + "unrestricted": "full-access", + # Backstop alias used by some skills/tests. 
@dataclass
class TurnResult:
    """Result of one user→assistant→tool turn through the codex app-server.

    Built and returned by ``CodexAppServerSession.run_turn``; every field has
    a default so a partially filled result can be returned on early failure.
    """

    # Text of the most recent completed agent message seen during the turn
    # (later agent messages overwrite earlier ones).
    final_text: str = ""
    # Message dicts produced by the event projector, in arrival order.
    projected_messages: list[dict] = field(default_factory=list)
    # Number of projections flagged as a tool iteration.
    tool_iterations: int = 0
    # True when the turn loop issued an interrupt (caller request or deadline).
    interrupted: bool = False
    # Set if turn ended in a non-recoverable error (turn/start failure,
    # failed turn status with an error object, or timeout).
    error: Optional[str] = None
    # Id from the turn/start response, when the server supplied one.
    turn_id: Optional[str] = None
    # Codex thread the turn ran on.
    thread_id: Optional[str] = None
def __init__(
    self,
    *,
    cwd: Optional[str] = None,
    codex_bin: str = "codex",
    codex_home: Optional[str] = None,
    permission_profile: Optional[str] = None,
    approval_callback: Optional[Callable[..., str]] = None,
    on_event: Optional[Callable[[dict], None]] = None,
    request_routing: Optional[_ServerRequestRouting] = None,
    client_factory: Optional[Callable[..., CodexAppServerClient]] = None,
) -> None:
    """Record configuration only; nothing is spawned until ``ensure_started``.

    Args:
        cwd: Working directory for the codex thread; defaults to the
            process's current directory.
        codex_bin: Codex executable handed to the client factory.
        codex_home: Optional codex home handed to the client factory.
        permission_profile: Explicit profile id. When falsy it is derived
            from ``HERMES_TERMINAL_SECURITY_MODE`` (default ``"auto"``),
            falling back to ``"workspace-write"`` for unknown modes.
        approval_callback: Called as ``(command, description,
            allow_permanent=False)`` to decide approval requests.
        on_event: Display hook invoked with raw notifications.
        request_routing: Auto-approval policy; defaults to a fresh
            ``_ServerRequestRouting()``.
        client_factory: Callable that builds the client; defaults to
            ``CodexAppServerClient``.
    """
    self._cwd = cwd or os.getcwd()
    self._codex_bin = codex_bin
    self._codex_home = codex_home

    if permission_profile:
        resolved_profile = permission_profile
    else:
        security_mode = os.environ.get("HERMES_TERMINAL_SECURITY_MODE", "auto")
        resolved_profile = _HERMES_TO_CODEX_PERMISSION_PROFILE.get(
            security_mode, "workspace-write"
        )
    self._permission_profile = resolved_profile

    self._approval_callback = approval_callback
    self._on_event = on_event  # Display hook (kawaii spinner ticks etc.)
    self._routing = request_routing if request_routing else _ServerRequestRouting()
    self._client_factory = client_factory if client_factory else CodexAppServerClient

    self._client: Optional[CodexAppServerClient] = None
    self._thread_id: Optional[str] = None
    self._interrupt_event = threading.Event()
    # item id -> human-readable change summary for in-flight fileChange
    # items. Filled when the item starts and read when codex asks for
    # approval, because the approval params don't carry the changeset.
    self._pending_file_changes: dict[str, str] = {}
    self._closed = False
Idempotent — repeated calls + return the same thread id.""" + if self._thread_id is not None: + return self._thread_id + if self._client is None: + self._client = self._client_factory( + codex_bin=self._codex_bin, codex_home=self._codex_home + ) + self._client.initialize( + client_name="hermes", + client_title="Hermes Agent", + client_version=_get_hermes_version(), + ) + # Permission selection is intentionally NOT sent on thread/start. + # Two reasons (live-tested against codex 0.130.0): + # 1. `thread/start.permissions` is gated behind the experimentalApi + # capability on this codex version — we'd have to opt in during + # initialize and accept the unstable surface. + # 2. Even with experimentalApi declared and the correct shape + # (`{"type": "profile", "id": "..."}`, not `{"profileId": ...}`), + # codex requires a matching `[permissions]` table in + # ~/.codex/config.toml or it fails the request with + # 'default_permissions requires a [permissions] table'. + # Letting codex pick its default (`:read-only` unless the user has + # configured otherwise in their codex config.toml) is the standard + # codex CLI workflow and avoids fighting codex's own validation. + # Users who want a write-capable profile configure it in their + # ~/.codex/config.toml the same way they would for any codex usage. 
def run_turn(
    self,
    user_input: str,
    *,
    turn_timeout: float = 600.0,
    notification_poll_timeout: float = 0.25,
) -> TurnResult:
    """Send a user message and block until the turn finishes.

    While waiting, server-initiated approval requests are answered and
    notifications are projected into Hermes' message shape.

    Args:
        user_input: Text sent as the turn's single text input.
        turn_timeout: Overall budget in seconds; on expiry the turn is
            interrupted and the result is marked interrupted with an error.
        notification_poll_timeout: How long each notification poll blocks,
            which bounds the latency of interrupt checks.

    Returns:
        A ``TurnResult``. If ``turn/start`` is rejected by the server, the
        result carries only ``thread_id`` and ``error``.
    """
    self.ensure_started()
    assert self._client is not None and self._thread_id is not None

    self._interrupt_event.clear()
    projector = CodexEventProjector()
    result = TurnResult(thread_id=self._thread_id)

    # Send turn/start with the user input. Text-only for now.
    try:
        ts = self._client.request(
            "turn/start",
            {
                "threadId": self._thread_id,
                "input": [{"type": "text", "text": user_input}],
            },
            timeout=10,
        )
    except CodexAppServerError as exc:
        result.error = f"turn/start failed: {exc}"
        return result

    result.turn_id = (ts.get("turn") or {}).get("id")
    # Monotonic clock: a wall-clock adjustment must not shorten or extend
    # the turn budget.
    deadline = time.monotonic() + turn_timeout
    turn_complete = False

    while not turn_complete and time.monotonic() < deadline:
        if self._interrupt_event.is_set():
            self._issue_interrupt(result.turn_id)
            result.interrupted = True
            break

        # Answer server-initiated requests (approvals) before reading more
        # notifications, so the codex side isn't blocked.
        sreq = self._client.take_server_request(timeout=0)
        if sreq is not None:
            # First drain a bounded number of already-queued notifications
            # so per-turn state (e.g. pending fileChange summaries) is up
            # to date when the approval decision is made. They go through
            # the same handler as the main path, so a turn/completed seen
            # here still ends the loop.
            for _ in range(8):
                queued = self._client.take_notification(timeout=0)
                if queued is None:
                    break
                if self._consume_notification(queued, projector, result):
                    turn_complete = True
                    break
            self._handle_server_request(sreq)
            continue

        note = self._client.take_notification(timeout=notification_poll_timeout)
        if note is None:
            continue
        turn_complete = self._consume_notification(note, projector, result)

    if not turn_complete and not result.interrupted:
        # Hit the deadline. Issue interrupt to stop wasted compute.
        self._issue_interrupt(result.turn_id)
        result.interrupted = True
        result.error = result.error or f"turn timed out after {turn_timeout}s"

    return result

def _consume_notification(
    self, note: dict, projector: CodexEventProjector, result: TurnResult
) -> bool:
    """Fold one notification into ``result``; return True on turn/completed."""
    if self._on_event is not None:
        try:
            self._on_event(note)
        except Exception:  # pragma: no cover - display callback
            logger.debug("on_event callback raised", exc_info=True)

    # Keep fileChange summaries current for the approval bridge.
    self._track_pending_file_change(note)

    projection = projector.project(note)
    if projection.messages:
        result.projected_messages.extend(projection.messages)
    if projection.is_tool_iteration:
        result.tool_iterations += 1
    if projection.final_text is not None:
        # Several agent messages may arrive in one turn; the last one wins.
        result.final_text = projection.final_text

    if note.get("method", "") != "turn/completed":
        return False

    turn = (note.get("params") or {}).get("turn") or {}
    status = turn.get("status")
    if status and status not in ("completed", "interrupted"):
        err_obj = turn.get("error")
        if err_obj:
            # Tolerate an error that is not an object (e.g. a bare string).
            detail = err_obj.get("message") if isinstance(err_obj, dict) else None
            result.error = f"turn ended status={status}: {detail or err_obj}"
    return True
+ + Method names verified live against codex 0.130.0 (Apr 2026): + item/commandExecution/requestApproval — exec approvals + item/fileChange/requestApproval — apply_patch approvals + item/permissions/requestApproval — permissions changes + (we decline; user controls + permission profile in + ~/.codex/config.toml). + """ + if self._client is None: + return + method = req.get("method", "") + rid = req.get("id") + params = req.get("params") or {} + + if method == "item/commandExecution/requestApproval": + decision = self._decide_exec_approval(params) + self._client.respond(rid, {"decision": decision}) + elif method == "item/fileChange/requestApproval": + decision = self._decide_apply_patch_approval(params) + self._client.respond(rid, {"decision": decision}) + elif method == "item/permissions/requestApproval": + # Codex sometimes asks to escalate permissions mid-turn. We + # always decline — the user already chose their permission + # profile in ~/.codex/config.toml and surprise escalations + # shouldn't be silently accepted. + self._client.respond(rid, {"decision": "decline"}) + elif method == "mcpServer/elicitation/request": + # Codex's MCP layer asks the user for structured input on + # behalf of an MCP server (e.g. tool-call confirmation, + # OAuth, form data). For our own hermes-tools callback we + # auto-accept — the user already approved Hermes' tools + # by enabling the runtime, and we never expose anything + # codex's built-in shell can't already do. For other MCP + # servers we decline so the user explicitly opts in via + # codex's own auth flow. + server_name = params.get("serverName") or "" + if server_name == "hermes-tools": + self._client.respond( + rid, + {"action": "accept", "content": None, "_meta": None}, + ) + else: + self._client.respond( + rid, + {"action": "decline", "content": None, "_meta": None}, + ) + else: + # Unknown server request — codex can extend this surface. Reject + # cleanly so codex doesn't hang waiting for us. 
+ logger.warning("Unknown codex server request: %s", method) + self._client.respond_error( + rid, code=-32601, message=f"Unsupported method: {method}" + ) + + def _decide_exec_approval(self, params: dict) -> str: + if self._routing.auto_approve_exec: + return "accept" + command = params.get("command") or "" + # Codex's CommandExecutionRequestApprovalParams has cwd as Optional — + # fall back to the session's cwd when codex doesn't include it so the + # approval prompt is never empty (quirk #10 fix). + cwd = params.get("cwd") or self._cwd or "" + reason = params.get("reason") + description = f"Codex requests exec in {cwd}" + if reason: + description += f" — {reason}" + if self._approval_callback is not None: + try: + choice = self._approval_callback( + command, description, allow_permanent=False + ) + return _approval_choice_to_codex_decision(choice) + except Exception: + logger.exception("approval_callback raised on exec request") + return "decline" + return "decline" # fail-closed when no callback wired + + def _decide_apply_patch_approval(self, params: dict) -> str: + if self._routing.auto_approve_apply_patch: + return "accept" + if self._approval_callback is not None: + # FileChangeRequestApprovalParams gives us reason + grantRoot. + # The actual changeset lives on the corresponding fileChange + # item which the projector has already cached for us — look it + # up by item_id so the user sees what's actually changing. 
+ reason = params.get("reason") + grant_root = params.get("grantRoot") + item_id = params.get("itemId") or "" + change_summary = self._lookup_pending_file_change(item_id) + description_parts = [] + if reason: + description_parts.append(reason) + if change_summary: + description_parts.append(change_summary) + if grant_root: + description_parts.append(f"grants write to {grant_root}") + description = ( + "; ".join(description_parts) + if description_parts + else "Codex requests to apply a patch" + ) + command_label = ( + f"apply_patch: {change_summary}" if change_summary + else f"apply_patch: {reason}" if reason + else "apply_patch" + ) + try: + choice = self._approval_callback( + command_label, + description, + allow_permanent=False, + ) + return _approval_choice_to_codex_decision(choice) + except Exception: + logger.exception("approval_callback raised on apply_patch") + return "decline" + return "decline" + + def _track_pending_file_change(self, note: dict) -> None: + """Maintain self._pending_file_changes from item/started + item/completed + notifications. 
Lets the apply_patch approval prompt show what's + actually changing — codex's approval params don't carry the data.""" + method = note.get("method", "") + params = note.get("params") or {} + item = params.get("item") or {} + if item.get("type") != "fileChange": + return + item_id = item.get("id") or "" + if not item_id: + return + if method == "item/started": + changes = item.get("changes") or [] + if not changes: + self._pending_file_changes[item_id] = "1 change pending" + return + kinds: dict[str, int] = {} + paths: list[str] = [] + for ch in changes: + if not isinstance(ch, dict): + continue + kind = (ch.get("kind") or {}).get("type") or "update" + kinds[kind] = kinds.get(kind, 0) + 1 + p = ch.get("path") or "" + if p: + paths.append(p) + counts = ", ".join(f"{n} {k}" for k, n in sorted(kinds.items())) + preview = ", ".join(paths[:3]) + if len(paths) > 3: + preview += f", +{len(paths) - 3} more" + self._pending_file_changes[item_id] = ( + f"{counts}: {preview}" if preview else counts + ) + elif method == "item/completed": + self._pending_file_changes.pop(item_id, None) + + def _lookup_pending_file_change(self, item_id: str) -> Optional[str]: + """Look up an in-progress fileChange item by id and summarize its + changes for the approval prompt. Returns None when we don't have + the item cached (e.g. approval arrived before item/started, or + fileChange item content not tracked yet).""" + if not item_id: + return None + cached = self._pending_file_changes.get(item_id) + if not cached: + return None + return cached + + +def _approval_choice_to_codex_decision(choice: str) -> str: + """Map Hermes approval choices onto codex's CommandExecutionApprovalDecision + / FileChangeApprovalDecision wire values. + + Hermes returns 'once', 'session', 'always', or 'deny'. + Codex expects 'accept', 'acceptForSession', 'decline', or 'cancel' + (verified against codex-rs/app-server-protocol/src/protocol/v2/item.rs + on codex 0.130.0). 
+ """ + if choice in ("once",): + return "accept" + if choice in ("session", "always"): + return "acceptForSession" + return "decline" + + +def _get_hermes_version() -> str: + """Best-effort Hermes version string for codex's userAgent line.""" + try: + from importlib.metadata import version + + return version("hermes-agent") + except Exception: # pragma: no cover + return "0.0.0" diff --git a/agent/transports/codex_event_projector.py b/agent/transports/codex_event_projector.py new file mode 100644 index 00000000000..0a388a60cfb --- /dev/null +++ b/agent/transports/codex_event_projector.py @@ -0,0 +1,312 @@ +"""Projects codex app-server events into Hermes' messages list. + +The translator that lets Hermes' memory/skill review keep working under the +Codex runtime: it converts Codex `item/*` notifications into the standard +OpenAI-shaped `{role, content, tool_calls, tool_call_id}` entries that +`agent/curator.py` already knows how to read. + +Codex emits items with a discriminator field `type`: + - userMessage → {role: "user", content} + - agentMessage → {role: "assistant", content} + - reasoning → stashed in the assistant's "reasoning" field + - commandExecution → assistant tool_call(name="exec") + tool result + - fileChange → assistant tool_call(name="apply_patch") + tool result + - mcpToolCall → assistant tool_call(name=f"mcp.{server}.{tool}") + tool result + - dynamicToolCall → assistant tool_call(name=tool) + tool result + - plan/hookPrompt/collabAgentToolCall → recorded as opaque assistant notes + +Each item maps to AT MOST one assistant entry + one tool entry, preserving +Hermes' message-alternation invariants (system → user → assistant → user/tool +→ assistant → ...). Multiple Codex tool calls within one Codex turn produce +multiple consecutive (assistant, tool) pairs, which is the same shape Hermes +already produces for parallel tool calls. + +Counters tracked alongside projection: + - tool_iterations: ticks once per completed tool-shaped item. 
def _deterministic_call_id(item_type: str, item_id: str, content: str = "") -> str:
    """Return a stable ``tool_call`` id for correlating a call with its result.

    The codex item id is used directly when present. Otherwise the id falls
    back to a hash so replaying the same item produces the same id across
    sessions and prefix caches stay valid. See AGENTS.md Pitfall #16
    (deterministic IDs in tool call history).

    Args:
        item_type: Short tag naming the projected tool kind (e.g. ``"exec"``).
        item_id: The codex item id; may be empty.
        content: Optional serialized payload mixed into the fallback hash.
            Without it every id-less item of the same kind would share one
            id, so tool results could not be matched to their calls.

    Returns:
        ``codex_<item_type>_<item_id>``, or ``codex_<item_type>_<16 hex>``
        when no item id is available.
    """
    if item_id:
        return f"codex_{item_type}_{item_id}"
    # With no content the seed is the bare type, so callers that never pass
    # ``content`` keep receiving exactly the ids they always did.
    seed = f"{item_type}\n{content}" if content else f"{item_type}"
    digest = hashlib.sha256(seed.encode("utf-8", errors="replace")).hexdigest()[:16]
    return f"codex_{item_type}_{digest}"


def _format_tool_args(d: dict) -> str:
    """Format a dict as JSON the way Hermes' existing tool_calls path does."""
    return json.dumps(d, ensure_ascii=False, sort_keys=True)


@dataclass
class ProjectionResult:
    """Output of projecting one Codex item.

    `messages` is a list because some Codex items produce two messages
    (assistant tool_call + tool result). Empty list = item ignored (e.g. a
    streaming delta that doesn't materialize into messages until the
    `item/completed` event).
    """

    # Messages to append to the transcript, in order.
    messages: list[dict] = field(default_factory=list)
    # True when the item was projected as an (assistant, tool) pair.
    is_tool_iteration: bool = False
    # Set when an agentMessage completes.
    final_text: Optional[str] = None


class CodexEventProjector:
    """Stateful projector consuming Codex notifications in arrival order.

    Owns the in-progress reasoning content: codex emits reasoning as separate
    items, and this class attaches it to the next assistant message it
    produces (agent message or tool call).
    """

    def __init__(self) -> None:
        self._pending_reasoning: list[str] = []

    def project(self, notification: dict) -> ProjectionResult:
        """Project a single notification into zero or more messages.

        Only `item/completed` materializes messages. Every other method
        (streaming deltas such as `item/<type>/outputDelta`, turn lifecycle
        events, ...) is display-only here and returns an empty result
        without touching projector state.
        """
        if notification.get("method", "") != "item/completed":
            return ProjectionResult()

        params = notification.get("params", {}) or {}
        item = params.get("item") or {}
        item_type = item.get("type") or ""
        item_id = item.get("id") or ""

        if item_type == "agentMessage":
            return self._project_agent_message(item)
        if item_type == "userMessage":
            return self._project_user_message(item)
        if item_type == "reasoning":
            # Stash only; the text rides along on the next assistant message.
            self._pending_reasoning.extend(
                self._reasoning_fragments(item.get("summary"))
            )
            self._pending_reasoning.extend(
                self._reasoning_fragments(item.get("content"))
            )
            return ProjectionResult()

        tool_projection = {
            "commandExecution": self._project_command,
            "fileChange": self._project_file_change,
            "mcpToolCall": self._project_mcp_tool_call,
            "dynamicToolCall": self._project_dynamic_tool_call,
        }.get(item_type)
        if tool_projection is not None:
            return tool_projection(item, item_id)

        # Unknown / rare items (plan, hookPrompt, collabAgentToolCall, etc.)
        # — record as opaque assistant note so memory review can still see
        # *something* happened, but don't fabricate tool_call structure.
        return self._project_opaque(item, item_type)

    # ---------- shared helpers ----------

    @staticmethod
    def _reasoning_fragments(value: Any) -> list[str]:
        """Normalize a reasoning `summary`/`content` field to a list of str.

        A bare string is kept whole (extending a list with it would add one
        entry per character), and non-string fragments are stringified so
        the later ``"\\n".join`` cannot fail.
        """
        if not value:
            return []
        if not isinstance(value, (list, tuple)):
            value = [value]
        fragments: list[str] = []
        for part in value:
            if isinstance(part, str):
                fragments.append(part)
            elif isinstance(part, dict) and "text" in part:
                fragments.append(str(part["text"]))
            elif part is not None:
                fragments.append(str(part))
        return fragments

    def _drain_reasoning(self) -> Optional[str]:
        """Return the stashed reasoning joined by newlines and clear it.

        Returns None when nothing is pending.
        """
        if not self._pending_reasoning:
            return None
        joined = "\n".join(self._pending_reasoning)
        self._pending_reasoning = []
        return joined

    def _tool_exchange(
        self,
        id_tag: str,
        item_id: str,
        name: str,
        args: dict,
        result_text: str,
    ) -> ProjectionResult:
        """Build the (assistant tool_call, tool result) pair for one item."""
        serialized_args = _format_tool_args(args)
        # The serialized arguments seed the fallback id so two id-less calls
        # with different arguments do not collide.
        call_id = _deterministic_call_id(id_tag, item_id, serialized_args)
        assistant_msg: dict[str, Any] = {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {"name": name, "arguments": serialized_args},
                }
            ],
        }
        reasoning = self._drain_reasoning()
        if reasoning is not None:
            assistant_msg["reasoning"] = reasoning
        tool_msg = {
            "role": "tool",
            "tool_call_id": call_id,
            "content": result_text,
        }
        return ProjectionResult(
            messages=[assistant_msg, tool_msg], is_tool_iteration=True
        )

    # ---------- per-type projections ----------

    def _project_agent_message(self, item: dict) -> ProjectionResult:
        """Project an agentMessage into one assistant message."""
        text = item.get("text") or ""
        msg: dict[str, Any] = {"role": "assistant", "content": text}
        reasoning = self._drain_reasoning()
        if reasoning is not None:
            msg["reasoning"] = reasoning
        return ProjectionResult(messages=[msg], final_text=text)

    def _project_user_message(self, item: dict) -> ProjectionResult:
        """Project a userMessage into one user message (text parts only)."""
        # The content is iterated as a list of fragments. Text fragments are
        # flattened; parts without text (images, etc.) are ignored.
        text_parts: list[str] = []
        for fragment in item.get("content") or []:
            if not isinstance(fragment, dict):
                continue
            if fragment.get("type") == "text":
                text_parts.append(fragment.get("text") or "")
            elif "text" in fragment:
                text_parts.append(str(fragment["text"]))
        return ProjectionResult(
            messages=[{"role": "user", "content": "\n".join(text_parts)}]
        )

    def _project_command(self, item: dict, item_id: str) -> ProjectionResult:
        """Project a commandExecution as an ``exec_command`` tool call."""
        args = {
            "command": item.get("command") or "",
            "cwd": item.get("cwd") or "",
        }
        output = item.get("aggregatedOutput") or ""
        exit_code = item.get("exitCode")
        if exit_code is not None and exit_code != 0:
            # Surface failures in the tool result itself.
            output = f"[exit {exit_code}]\n{output}"
        return self._tool_exchange("exec", item_id, "exec_command", args, output)

    def _project_file_change(self, item: dict, item_id: str) -> ProjectionResult:
        """Project a fileChange as an ``apply_patch`` tool call."""
        # Reduce the codex changes array to a digest the agent loop will
        # find readable. We record per-file change kinds (Add/Update/Delete)
        # without inlining full file contents — those can be huge.
        changes_summary = []
        for change in item.get("changes") or []:
            if not isinstance(change, dict):
                continue
            raw_kind = change.get("kind")
            if isinstance(raw_kind, dict):
                kind = raw_kind.get("type") or "update"
            elif isinstance(raw_kind, str) and raw_kind:
                # Tolerate a plain-string kind instead of crashing on .get().
                kind = raw_kind
            else:
                kind = "update"
            changes_summary.append({"kind": kind, "path": change.get("path") or ""})
        status = item.get("status") or "unknown"
        summary = f"apply_patch status={status}, {len(changes_summary)} change(s)"
        return self._tool_exchange(
            "apply_patch", item_id, "apply_patch", {"changes": changes_summary}, summary
        )

    def _project_mcp_tool_call(self, item: dict, item_id: str) -> ProjectionResult:
        """Project an mcpToolCall as an ``mcp.<server>.<tool>`` tool call."""
        server = item.get("server") or "mcp"
        tool = item.get("tool") or "unknown"
        args = item.get("arguments") or {}
        if not isinstance(args, dict):
            args = {"arguments": args}
        result = item.get("result")
        error = item.get("error")
        # Results are truncated so one large payload cannot flood the
        # transcript.
        if error:
            content = f"[error] {json.dumps(error, ensure_ascii=False)[:1000]}"
        elif result is not None:
            content = json.dumps(result, ensure_ascii=False)[:4000]
        else:
            content = ""
        return self._tool_exchange(
            f"mcp_{server}_{tool}", item_id, f"mcp.{server}.{tool}", args, content
        )

    def _project_dynamic_tool_call(
        self, item: dict, item_id: str
    ) -> ProjectionResult:
        """Project a dynamicToolCall as a tool call named after the tool."""
        tool = item.get("tool") or "unknown"
        args = item.get("arguments") or {}
        if not isinstance(args, dict):
            args = {"arguments": args}
        content_items = item.get("contentItems") or []
        if isinstance(content_items, list) and content_items:
            content = json.dumps(content_items, ensure_ascii=False)[:4000]
        else:
            content = f"success={item.get('success')}"
        return self._tool_exchange(f"dyn_{tool}", item_id, tool, args, content)

    def _project_opaque(self, item: dict, item_type: str) -> ProjectionResult:
        """Record an unrecognized item as a plain assistant note."""
        # Record the existence of the item without inventing tool_calls.
        # Pending reasoning is left untouched for the next real message.
        try:
            payload = json.dumps(item, ensure_ascii=False)[:1500]
        except (TypeError, ValueError):
            payload = repr(item)[:1500]
        return ProjectionResult(
            messages=[
                {
                    "role": "assistant",
                    "content": f"[codex {item_type}] {payload}",
                }
            ]
        )
logger = logging.getLogger(__name__)


# Tools we expose. Each name MUST match a registered Hermes tool that
# `model_tools.handle_function_call()` can dispatch.
#
# What we deliberately DO NOT expose:
#   - terminal / shell / read_file / write_file / patch / search_files /
#     process — codex's built-ins cover these and approval routes through
#     codex's own UI.
#   - delegate_task / memory / session_search / todo — these are
#     `_AGENT_LOOP_TOOLS` in Hermes (model_tools.py:493). They require
#     the running AIAgent context to dispatch (mid-loop state), so a
#     stateless MCP callback can't drive them. Hermes' default runtime
#     keeps these working; the codex_app_server runtime cannot.
EXPOSED_TOOLS: tuple[str, ...] = (
    "web_search",
    "web_extract",
    "browser_navigate",
    "browser_click",
    "browser_type",
    "browser_press",
    "browser_snapshot",
    "browser_scroll",
    "browser_back",
    "browser_get_images",
    "browser_console",
    "browser_vision",
    "vision_analyze",
    "image_generate",
    "skill_view",
    "skills_list",
    "text_to_speech",
    # Kanban worker handoff tools — gated on HERMES_KANBAN_TASK env var
    # (set by the kanban dispatcher when spawning a worker). Without these
    # in the callback, a worker spawned with openai_runtime=codex_app_server
    # could do the work but couldn't report completion back to the kernel,
    # making it hang until timeout. Stateless dispatch — they just read
    # the env var and write to ~/.hermes/kanban.db.
    "kanban_complete",
    "kanban_block",
    "kanban_comment",
    "kanban_heartbeat",
    "kanban_show",
    "kanban_list",
    # NOTE: kanban_create / kanban_unblock / kanban_link are orchestrator-
    # only — the kanban tool gates them on HERMES_KANBAN_TASK being unset.
    # They're exposed here for orchestrator agents running on the codex
    # runtime that need to dispatch new tasks.
    "kanban_create",
    "kanban_unblock",
    "kanban_link",
)


def _make_tool_handler(tool_name: str, description: str, dispatch: Any) -> Any:
    """Build the callable registered with FastMCP for one Hermes tool.

    The returned function forwards its keyword arguments to ``dispatch``
    (Hermes' ``handle_function_call``) and returns its result. Exceptions
    are logged and converted to a JSON error payload so a failing tool
    reports an error instead of propagating out of the handler.
    """

    def _dispatch(**kwargs: Any) -> str:
        try:
            return dispatch(tool_name, kwargs or {})
        except Exception as exc:
            logger.exception("tool %s raised", tool_name)
            return json.dumps({"error": str(exc), "tool": tool_name})

    _dispatch.__name__ = tool_name
    _dispatch.__doc__ = description
    return _dispatch


def _build_server() -> Any:
    """Create the FastMCP server with the exposed Hermes tools attached.

    Imports are lazy so the module can be imported without the mcp package
    installed; the failure only surfaces when the server is actually built.

    Returns:
        The configured FastMCP server instance.

    Raises:
        ImportError: if the 'mcp' package (or Hermes' model_tools) cannot
            be imported.
    """
    try:
        from mcp.server.fastmcp import FastMCP
    except ImportError as exc:  # pragma: no cover - install hint
        raise ImportError(
            f"hermes-tools MCP server requires the 'mcp' package: {exc}"
        ) from exc

    # Discover Hermes tools so dispatch works.
    from model_tools import (
        get_tool_definitions,
        handle_function_call,
    )

    mcp = FastMCP(
        "hermes-tools",
        # Only advertise what EXPOSED_TOOLS actually contains: delegation,
        # memory and session search are deliberately not exposed, so naming
        # them here would send the model looking for tools that don't exist.
        instructions=(
            "Hermes Agent's tool surface, exposed for use inside a Codex "
            "session. Use these for capabilities Codex's built-in toolset "
            "doesn't cover: web search/extract, browser automation, "
            "vision, image generation, text-to-speech, skills, and "
            "kanban task handoff."
        ),
    )

    # Pull authoritative Hermes tool definitions for the ones we expose, so
    # MCP clients see the same descriptions Hermes gives the model.
    all_defs = {
        td["function"]["name"]: td["function"]
        for td in (get_tool_definitions(quiet_mode=True) or [])
        if isinstance(td, dict) and td.get("type") == "function"
    }

    exposed_count = 0
    for name in EXPOSED_TOOLS:
        spec = all_defs.get(name)
        if spec is None:
            logger.debug(
                "skipping %s — not registered in this Hermes process", name
            )
            continue

        description = spec.get("description") or f"Hermes {name} tool"
        # NOTE(review): spec["parameters"] (the tool's JSON schema) is not
        # forwarded to FastMCP — only name and description are registered,
        # and the handler takes **kwargs. Confirm how the installed mcp SDK
        # derives the input schema for such a callable.
        handler = _make_tool_handler(name, description, handle_function_call)
        try:
            mcp.add_tool(handler, name=name, description=description)
        except TypeError:
            # Older mcp SDK signature — fall back to decorator-style.
            mcp.tool(name=name, description=description)(handler)

        exposed_count += 1

    logger.info(
        "hermes-tools MCP server registered %d/%d tools",
        exposed_count,
        len(EXPOSED_TOOLS),
    )
    return mcp


def main(argv: Optional[list[str]] = None) -> int:
    """Entry point for `python -m agent.transports.hermes_tools_mcp_server`.

    Args:
        argv: Command-line arguments without the program name. ``None``
            means "use ``sys.argv[1:]``"; an explicit empty list is honored
            as "no arguments".

    Returns:
        Process exit code: 0 on clean exit or Ctrl-C, 1 if the server
        crashed, 2 if it could not be built (missing dependency).
    """
    if argv is None:
        argv = sys.argv[1:]
    verbose = "--verbose" in argv or "-v" in argv

    logging.basicConfig(
        level=logging.INFO if verbose else logging.WARNING,
        stream=sys.stderr,  # MCP uses stdio for protocol — logs MUST go to stderr
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    # Quiet mode: keep Hermes' own banners off stdout (which is the MCP wire).
    os.environ.setdefault("HERMES_QUIET", "1")
    os.environ.setdefault("HERMES_REDACT_SECRETS", "true")

    try:
        server = _build_server()
    except ImportError as exc:
        sys.stderr.write(f"hermes-tools MCP server cannot start: {exc}\n")
        return 2

    # FastMCP runs with stdio transport by default when launched as a
    # subprocess.
    try:
        server.run()
    except KeyboardInterrupt:
        return 0
    except Exception as exc:
        logger.exception("hermes-tools MCP server crashed")
        sys.stderr.write(f"hermes-tools MCP server error: {exc}\n")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
+ + Usage: + /codex-runtime — show current state + /codex-runtime auto — Hermes default (chat_completions) + /codex-runtime codex_app_server — hand turns to codex subprocess + /codex-runtime on / off — synonyms for the above + """ + from hermes_cli import codex_runtime_switch as crs + + parts = cmd_original.split(None, 1) + raw_args = parts[1].strip() if len(parts) > 1 else "" + new_value, errors = crs.parse_args(raw_args) + if errors: + for err in errors: + _cprint(f"❌ {err}") + return + + # Load + persist via the existing config helpers + try: + from hermes_cli.config import load_config, save_config + except Exception as exc: + _cprint(f"❌ could not load config: {exc}") + return + cfg = load_config() + + result = crs.apply( + cfg, + new_value, + persist_callback=(save_config if new_value is not None else None), + ) + + prefix = "✓" if result.success else "✗" + for line in result.message.splitlines(): + _cprint(f" {prefix} {line}" if line.startswith("openai_runtime") + else f" {line}") + if result.success and result.requires_new_session: + _cprint(" Tip: `/reset` starts a new session immediately.") + def _should_handle_model_command_inline(self, text: str, has_images: bool = False) -> bool: """Return True when /model should be handled immediately on the UI thread.""" if not text or has_images or not _looks_like_slash_command(text): @@ -7454,6 +7494,8 @@ class HermesCLI: self._handle_resume_command(cmd_original) elif canonical == "model": self._handle_model_switch(cmd_original) + elif canonical == "codex-runtime": + self._handle_codex_runtime(cmd_original) elif canonical == "gquota": self._handle_gquota_command(cmd_original) diff --git a/gateway/run.py b/gateway/run.py index 4946a7e6c1e..95f1d811543 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -6128,6 +6128,12 @@ class GatewayRunner: if _cmd_def_inner and _cmd_def_inner.name == "model": return "Agent is running — wait or /stop first, then switch models." 
+ # /codex-runtime must not be used while the agent is running. + # Switching mid-turn would split a turn across two transports. + if _cmd_def_inner and _cmd_def_inner.name == "codex-runtime": + return ("Agent is running — wait or /stop first, then " + "change runtime.") + # /approve and /deny must bypass the running-agent interrupt path. # The agent thread is blocked on a threading.Event inside # tools/approval.py — sending an interrupt won't unblock it. @@ -6462,6 +6468,9 @@ class GatewayRunner: if canonical == "model": return await self._handle_model_command(event) + if canonical == "codex-runtime": + return await self._handle_codex_runtime_command(event) + if canonical == "personality": return await self._handle_personality_command(event) @@ -9242,6 +9251,51 @@ class GatewayRunner: return "\n".join(lines) + async def _handle_codex_runtime_command(self, event: MessageEvent) -> str: + """Handle /codex-runtime command in the gateway. + + Same surface as the CLI handler in cli.py: + /codex-runtime — show current state + /codex-runtime auto — Hermes default runtime + /codex-runtime codex_app_server — codex subprocess runtime + /codex-runtime on / off — synonyms + + On change, the cached agent for this session is evicted so the next + message creates a fresh AIAgent with the new api_mode wired in + (avoids prompt-cache invalidation mid-session).""" + from hermes_cli import codex_runtime_switch as crs + + raw_args = event.get_command_args().strip() if event else "" + new_value, errors = crs.parse_args(raw_args) + if errors: + return "❌ " + "\n❌ ".join(errors) + + # Load + persist via the same helpers used for /model and /yolo + try: + from hermes_cli.config import load_config, save_config + except Exception as exc: + return f"❌ Could not load config: {exc}" + cfg = load_config() + + result = crs.apply( + cfg, + new_value, + persist_callback=(save_config if new_value is not None else None), + ) + + # On a real change, evict the cached agent so the new runtime takes + # 
effect on the next message rather than waiting for cache TTL. + if result.success and new_value is not None and result.requires_new_session: + try: + session_key = self._session_key_for_source(event.source) + self._evict_cached_agent(session_key) + except Exception: + logger.debug("could not evict cached agent after codex-runtime change", + exc_info=True) + + prefix = "✓" if result.success else "✗" + return f"{prefix} {result.message}" + async def _handle_personality_command(self, event: MessageEvent) -> str: """Handle /personality command - list or set a personality.""" from hermes_constants import display_hermes_home diff --git a/hermes_cli/banner.py b/hermes_cli/banner.py index 1cfb0d51f76..c4ec348ef48 100644 --- a/hermes_cli/banner.py +++ b/hermes_cli/banner.py @@ -581,6 +581,19 @@ def build_welcome_banner(console: Console, model: str, cwd: str, if mcp_connected: summary_parts.append(f"{mcp_connected} MCP servers") summary_parts.append("/help for commands") + # Indicate when the codex_app_server runtime is active so users + # understand why tool counts may not match what's actually reachable + # (codex builds its own tool list inside the spawned subprocess). 
+ try: + from hermes_cli.codex_runtime_switch import get_current_runtime + from hermes_cli.config import load_config as _load_cfg + if get_current_runtime(_load_cfg()) == "codex_app_server": + right_lines.append( + f"[bold {accent}]Runtime:[/] [{text}]codex app-server[/] " + f"[dim {dim}](terminal/file ops/MCP run inside codex)[/]" + ) + except Exception: + pass # Show active profile name when not 'default' try: from hermes_cli.profiles import get_active_profile_name diff --git a/hermes_cli/codex_runtime_plugin_migration.py b/hermes_cli/codex_runtime_plugin_migration.py new file mode 100644 index 00000000000..c00ec26bd29 --- /dev/null +++ b/hermes_cli/codex_runtime_plugin_migration.py @@ -0,0 +1,598 @@ +"""Migrate Hermes' MCP server config and Codex's installed curated plugins +to the format Codex expects in ~/.codex/config.toml. + +When the user enables the codex_app_server runtime, the codex subprocess +runs its own MCP client and its own plugin runtime (Linear, Atlassian, +Asana, plus per-account ChatGPT apps via app/list). For both of those to +be useful, the user's choices need to be visible to codex too. This +module: + + 1. Reads Hermes' YAML and writes equivalent [mcp_servers.] + entries to ~/.codex/config.toml. + 2. Queries codex's `plugin/list` for the openai-curated marketplace + and writes [plugins."@"] entries for any plugin + the user has installed=true on their codex CLI. (This is what + OpenClaw calls "migrate native codex plugins" — the YouTube-video- + worthy bit Pash highlighted: Canva, GitHub, Calendar, Gmail + pre-configured.) + 3. Writes a [permissions] default profile so users on this runtime + don't get an approval prompt on every write attempt. 
+ +What translates (MCP servers): + Hermes mcp_servers.<name>.command/args/env → codex stdio transport + Hermes mcp_servers.<name>.url/headers → codex streamable_http transport + Hermes mcp_servers.<name>.timeout → codex tool_timeout_sec + Hermes mcp_servers.<name>.connect_timeout → codex startup_timeout_sec + +What does NOT translate (warned + skipped): + Hermes-specific keys (sampling, etc.) — codex's MCP client has no + equivalent. Listed per server in the report's skipped_keys_per_server field. + +What's NOT migrated (intentional): + AGENTS.md — codex respects this file natively in its cwd. Hermes' own + AGENTS.md (project-level) is already in the worktree, so codex picks + it up without translation. No code needed. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Optional + +logger = logging.getLogger(__name__) + + +# Marker comments wrapping the managed section so re-runs can detect +# what's ours and what's user-edited. A start marker with no end marker is +# treated as a legacy block (see `_strip_existing_managed_block`).
MIGRATION_MARKER = (
    "# managed by hermes-agent — `hermes codex-runtime migrate` regenerates this section"
)
MIGRATION_END_MARKER = "# end hermes-agent managed section"


@dataclass
class MigrationReport:
    """Outcome of a migration pass, with a human-readable `summary()`."""

    # File the managed section was (or would be) written to.
    target_path: Optional[Path] = None
    # Names of the MCP servers that were translated.
    migrated: list[str] = field(default_factory=list)
    # Per-server notes about keys that were not carried over.
    skipped_keys_per_server: dict[str, list[str]] = field(default_factory=dict)
    # Names of the native Codex plugins that were migrated.
    migrated_plugins: list[str] = field(default_factory=list)
    # Why plugin discovery was skipped, if it was.
    plugin_query_error: Optional[str] = None
    # The default permission profile that was written, if any.
    wrote_permissions_default: Optional[str] = None
    # Problems to surface to the user.
    errors: list[str] = field(default_factory=list)
    written: bool = False
    dry_run: bool = False

    def summary(self) -> str:
        """Render the report as newline-joined text for display."""
        report: list[str] = []

        # Headline: dry run takes precedence over an actual write.
        if self.dry_run:
            report.append(f"(dry run) Would write {self.target_path}")
        elif self.written:
            report.append(f"Wrote {self.target_path}")

        if not self.migrated:
            report.append("No MCP servers found in Hermes config.")
        else:
            report.append(f"Migrated {len(self.migrated)} MCP server(s):")
            for server in self.migrated:
                dropped = self.skipped_keys_per_server.get(server, [])
                suffix = f" (skipped: {', '.join(dropped)})" if dropped else ""
                report.append(f" - {server}{suffix}")

        # The discovery error is only shown when no plugin was migrated.
        if self.migrated_plugins:
            report.append(
                f"Migrated {len(self.migrated_plugins)} native Codex plugin(s):"
            )
            report.extend(f" - {plugin}" for plugin in self.migrated_plugins)
        elif self.plugin_query_error:
            report.append(f"Codex plugin discovery skipped: {self.plugin_query_error}")

        if self.wrote_permissions_default:
            report.append(
                f"Wrote default_permissions = {self.wrote_permissions_default!r}"
            )
        report.extend(f"⚠ {problem}" for problem in self.errors)
        return "\n".join(report)
# Keys Hermes understands on an MCP server entry. Any key outside this set
# (and outside _KEYS_DROPPED_WITH_WARNING) is reported as an unknown key.
_KNOWN_HERMES_KEYS = {
    # transport — stdio
    "command", "args", "env", "cwd",
    # transport — http
    "url", "headers", "transport",
    # timeouts
    "timeout", "connect_timeout",
    # general
    "enabled", "description",
}

# Hermes keys with NO codex equivalent — dropped during migration, each
# with a "(no codex equivalent)" note in the skipped list.
_KEYS_DROPPED_WITH_WARNING = {
    # Hermes' sampling subsection — codex MCP has no equivalent
    "sampling",
}


def _translate_one_server(
    name: str, hermes_cfg: dict
) -> tuple[Optional[dict], list[str]]:
    """Translate one Hermes MCP server config into a codex config dict.

    Args:
        name: Server name. Accepted for call-site symmetry; it does not
            influence the translation.
        hermes_cfg: The server's Hermes config mapping.

    Returns:
        ``(codex_entry, skipped)``. ``codex_entry`` is a dict ready for TOML
        serialization, or None when the server can't be translated (config
        is not a dict, or neither ``command`` nor ``url`` is set).
        ``skipped`` lists human-readable notes for everything that was not
        carried over.
    """
    if not isinstance(hermes_cfg, dict):
        return None, []

    skipped: list[str] = []
    out: dict[str, Any] = {}

    has_command = bool(hermes_cfg.get("command"))
    has_url = bool(hermes_cfg.get("url"))

    if has_command and has_url:
        skipped.append("url (both command and url set; preferring stdio)")
        has_url = False

    if has_command:
        # Stdio transport
        out["command"] = str(hermes_cfg["command"])
        args = hermes_cfg.get("args") or []
        if not isinstance(args, (list, tuple)):
            # A scalar (typically one string) is a single argument;
            # iterating it would emit one argument per character.
            args = [args]
        if args:
            out["args"] = [str(a) for a in args]
        env = hermes_cfg.get("env") or {}
        if isinstance(env, dict):
            if env:
                # Codex expects string values
                out["env"] = {str(k): str(v) for k, v in env.items()}
        else:
            skipped.append("env (not a mapping)")
        cwd = hermes_cfg.get("cwd")
        if cwd:
            out["cwd"] = str(cwd)
    elif has_url:
        # streamable_http transport (codex covers both http and SSE here)
        out["url"] = str(hermes_cfg["url"])
        headers = hermes_cfg.get("headers") or {}
        if isinstance(headers, dict):
            if headers:
                out["http_headers"] = {str(k): str(v) for k, v in headers.items()}
        else:
            skipped.append("headers (not a mapping)")
        # Hermes' transport: sse hint is informational; codex auto-negotiates
        if hermes_cfg.get("transport") == "sse":
            skipped.append("transport=sse (codex auto-negotiates)")
    else:
        return None, ["no command or url field"]

    # Timeouts: Hermes name → codex name. Non-numeric values are reported
    # instead of aborting the whole server.
    for hermes_key, codex_key in (
        ("timeout", "tool_timeout_sec"),
        ("connect_timeout", "startup_timeout_sec"),
    ):
        if hermes_key not in hermes_cfg:
            continue
        try:
            out[codex_key] = float(hermes_cfg[hermes_key])
        except (TypeError, ValueError):
            skipped.append(f"{hermes_key} (not numeric)")

    # Enabled flag (codex defaults to true so we only emit when explicitly false)
    if hermes_cfg.get("enabled") is False:
        out["enabled"] = False

    # Report keys that were dropped on purpose or not recognized at all.
    for key in hermes_cfg:
        if key in _KEYS_DROPPED_WITH_WARNING:
            skipped.append(f"{key} (no codex equivalent)")
        elif key not in _KNOWN_HERMES_KEYS:
            skipped.append(f"{key} (unknown Hermes key)")

    return out, skipped
# Translation table for TOML basic strings. TOML forbids every literal
# control character (U+0000–U+001F and U+007F) inside a basic string, so
# each one gets an escape: the short form where TOML defines one, \uXXXX
# otherwise. str.translate works in one pass, so the backslashes introduced
# by one escape are never re-escaped by another.
_TOML_BASIC_STRING_ESCAPES = {
    **{cp: f"\\u{cp:04X}" for cp in (*range(0x20), 0x7F)},
    0x08: "\\b",
    0x09: "\\t",
    0x0A: "\\n",
    0x0C: "\\f",
    0x0D: "\\r",
    ord('"'): '\\"',
    ord("\\"): "\\\\",
}


def _escape_toml_basic_string(text: str) -> str:
    """Escape *text* for use between the quotes of a TOML basic string."""
    return text.translate(_TOML_BASIC_STRING_ESCAPES)


def _format_toml_value(value: Any) -> str:
    """Minimal TOML value formatter for the value types we emit.

    Supports booleans, ints, floats, strings, lists/tuples (TOML arrays) and
    dicts (inline tables) of those — no arrays of tables.

    Raises:
        ValueError: for any other type (including None, which TOML cannot
            represent).
    """
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return repr(value)
    if isinstance(value, str):
        return f'"{_escape_toml_basic_string(value)}"'
    if isinstance(value, (list, tuple)):
        return "[" + ", ".join(_format_toml_value(v) for v in value) + "]"
    if isinstance(value, dict):
        items = ", ".join(
            f"{_quote_key(str(k))} = {_format_toml_value(v)}"
            for k, v in value.items()
        )
        return "{ " + items + " }" if items else "{}"
    raise ValueError(f"Unsupported TOML value type: {type(value).__name__}")


def _quote_key(key: str) -> str:
    """Return *key* bare when TOML allows it, otherwise as a quoted key.

    TOML bare keys may contain only ASCII letters, ASCII digits, ``-`` and
    ``_``. ``str.isalnum`` alone also accepts non-ASCII letters and digits,
    which would produce an invalid bare key, hence the ``isascii`` guard.
    The empty key is always quoted.
    """
    if key and key.isascii() and all(c.isalnum() or c in "-_" for c in key):
        return key
    return f'"{_escape_toml_basic_string(key)}"'
def render_codex_toml_section(
    servers: dict[str, dict],
    plugins: Optional[list[dict]] = None,
    default_permission_profile: Optional[str] = None,
) -> str:
    """Render the managed block for ~/.codex/config.toml.

    The block is wrapped in MIGRATION_MARKER / MIGRATION_END_MARKER and
    contains, in this order: an optional top-level ``default_permissions``
    key, one ``[mcp_servers.<name>]`` table per server (sorted by name), and
    one ``[plugins."<name>@<marketplace>"]`` table per plugin.

    Args:
        servers: dict of MCP server name → translated codex config dict.
        plugins: optional list of {name, marketplace, enabled} for native
            Codex plugins to enable. Entries that are not dicts or have no
            name are skipped (they would render as ``[plugins."@..."]``).
        default_permission_profile: when set, emitted as the top-level
            ``default_permissions`` key; a leading ":" is added if missing.
            (The ``[permissions]`` table is not written.)

    Returns:
        The managed block as text, ending with a newline.
    """
    out = [MIGRATION_MARKER]
    if not servers and not plugins and not default_permission_profile:
        out.append("# (no MCP servers, plugins, or permissions configured by Hermes)")
        out.append(MIGRATION_END_MARKER)
        return "\n".join(out) + "\n"

    if default_permission_profile:
        # Codex's config schema: `default_permissions` is a top-level
        # string referencing a profile name. Built-in profile names start
        # with ":" (":workspace-write", ":read-only", ":full-access"). The
        # [permissions] table is for *user-defined* named profiles with
        # structured fields — not what we want.
        # NOTE(review): in TOML a bare key belongs to the most recent table
        # header above it. This stays top-level only if no table precedes
        # the managed block in the final file — confirm against the writer.
        normalized = (
            default_permission_profile
            if default_permission_profile.startswith(":")
            else f":{default_permission_profile}"
        )
        out.append("")
        out.append(f"default_permissions = {_format_toml_value(normalized)}")

    for name in sorted(servers or {}):
        out.append("")
        out.append(f"[mcp_servers.{_quote_key(name)}]")
        for k, v in servers[name].items():
            out.append(f"{_quote_key(k)} = {_format_toml_value(v)}")

    named_plugins = [
        p for p in (plugins or []) if isinstance(p, dict) and p.get("name")
    ]
    for plugin in sorted(
        named_plugins,
        key=lambda p: f"{p.get('name','')}@{p.get('marketplace','')}",
    ):
        marketplace = plugin.get("marketplace") or "openai-curated"
        qualified = f"{plugin['name']}@{marketplace}"
        out.append("")
        out.append(f"[plugins.{_quote_key(qualified)}]")
        out.append(f"enabled = {_format_toml_value(bool(plugin.get('enabled', True)))}")

    out.append("")
    out.append(MIGRATION_END_MARKER)
    return "\n".join(out) + "\n"
This matches what older versions of + this code wrote so re-runs don't break configs from prior Hermes + versions.""" + lines = toml_text.splitlines(keepends=True) + out: list[str] = [] + in_managed = False + saw_end_marker = False + for line in lines: + line_stripped_nl = line.rstrip("\n") + if line_stripped_nl == MIGRATION_MARKER: + in_managed = True + saw_end_marker = False + continue + if in_managed: + if line_stripped_nl == MIGRATION_END_MARKER: + in_managed = False + saw_end_marker = True + continue + stripped = line.lstrip() + if not saw_end_marker and stripped.startswith("[") and not ( + stripped.startswith("[mcp_servers") + or stripped.startswith("[plugins") + or stripped.startswith("[permissions]") + or stripped.startswith("[permissions.") + ): + # Old-format managed block without end marker: bail back + # to user content as soon as we see a non-managed section. + in_managed = False + out.append(line) + continue + # Otherwise swallow the line. + continue + out.append(line) + return "".join(out) + + +def _query_codex_plugins( + codex_home: Optional[Path] = None, + timeout: float = 8.0, +) -> tuple[list[dict], Optional[str]]: + """Query codex's `plugin/list` for installed curated plugins. + + Spawns `codex app-server` briefly, sends initialize + plugin/list, + extracts plugins where installed=true. Returns (plugins, error). + Plugins is a list of {name, marketplace, enabled} dicts ready for + render_codex_toml_section(). + + On any failure (codex not installed, RPC error, timeout) returns + ([], error_message). Migration treats this as non-fatal — MCP + servers and permissions still write through. 
+ """ + try: + from agent.transports.codex_app_server import CodexAppServerClient + except Exception as exc: + return [], f"transport unavailable: {exc}" + + try: + with CodexAppServerClient( + codex_home=str(codex_home) if codex_home else None + ) as client: + client.initialize(client_name="hermes-migration") + resp = client.request("plugin/list", {}, timeout=timeout) + except Exception as exc: + return [], f"plugin/list query failed: {exc}" + + out: list[dict] = [] + seen: set[tuple[str, str]] = set() + marketplaces = resp.get("marketplaces") or [] + if not isinstance(marketplaces, list): + return [], "plugin/list response missing 'marketplaces'" + for marketplace in marketplaces: + if not isinstance(marketplace, dict): + continue + market_name = str(marketplace.get("name") or "openai-curated") + plugins = marketplace.get("plugins") or [] + if not isinstance(plugins, list): + continue + for plugin in plugins: + if not isinstance(plugin, dict): + continue + installed = bool(plugin.get("installed", False)) + if not installed: + continue + name = str(plugin.get("name") or "") + if not name: + continue + key = (name, market_name) + if key in seen: + continue + seen.add(key) + # Carry forward whatever 'enabled' codex reports — defaults to + # true for installed plugins. This is the same shape OpenClaw + # writes when migrating native codex plugins. + out.append({ + "name": name, + "marketplace": market_name, + "enabled": bool(plugin.get("enabled", True)), + }) + return out, None + + +def _build_hermes_tools_mcp_entry() -> dict: + """Build the codex stdio-transport entry that launches Hermes' own + tool surface as an MCP server. Codex's subprocess will call back into + this for browser/web/delegate_task/vision/memory/skills tools. + + The command runs the worktree's Python via the current sys.executable + so a hermes installed under /opt/, /usr/local/, or a venv all work. 
+ HERMES_HOME and PYTHONPATH are passed through so the spawned process + sees the same config + module layout the user is running.""" + import sys + + env: dict[str, str] = {} + # HERMES_HOME passes through if set so the MCP subprocess sees the + # same config / auth / sessions DB as the parent CLI. + hermes_home = os.environ.get("HERMES_HOME") + if hermes_home: + env["HERMES_HOME"] = hermes_home + # PYTHONPATH passes through so a worktree-launched hermes finds the + # branch's modules instead of the installed package. + pythonpath = os.environ.get("PYTHONPATH") + if pythonpath: + env["PYTHONPATH"] = pythonpath + # Quiet mode + redaction defaults so the MCP wire stays clean. + env["HERMES_QUIET"] = "1" + env["HERMES_REDACT_SECRETS"] = env.get("HERMES_REDACT_SECRETS", "true") + + out: dict[str, Any] = { + "command": sys.executable, + "args": ["-m", "agent.transports.hermes_tools_mcp_server"], + } + if env: + out["env"] = env + # Generous timeouts — browser_navigate or delegate_task can take a + # while; we don't want codex's MCP client to give up too early. + out["startup_timeout_sec"] = 30.0 + out["tool_timeout_sec"] = 600.0 + return out + + +def migrate( + hermes_config: dict, + *, + codex_home: Optional[Path] = None, + dry_run: bool = False, + discover_plugins: bool = True, + default_permission_profile: Optional[str] = ":workspace", + expose_hermes_tools: bool = True, +) -> MigrationReport: + """Translate Hermes mcp_servers config + Codex curated plugins into + ~/.codex/config.toml. + + Args: + hermes_config: full ~/.hermes/config.yaml dict + codex_home: override CODEX_HOME (defaults to ~/.codex) + dry_run: skip the actual write; report what would happen + discover_plugins: when True (default), query `plugin/list` against + the live codex CLI to migrate any installed curated plugins + into [plugins."@"] entries. Set False to + skip the subprocess spawn (for tests or restricted environments). 
+ default_permission_profile: when set (default ":workspace"), write + top-level `default_permissions = ""` so users on this + runtime don't get an approval prompt on every write attempt. + Built-in codex profile names are ":workspace", ":read-only", + ":danger-no-sandbox" (note the leading ":"). Also accepts a + user-defined profile name (no leading ":") that the user has + configured in their own [permissions.] table. Set None + to leave permissions unset and let codex use its compiled-in + default (which is read-only). + expose_hermes_tools: when True (default), register Hermes' own + tool surface (web_search, browser_*, delegate_task, vision, + memory, skills, etc.) as an MCP server in ~/.codex/config.toml + so the codex subprocess can call back into Hermes for tools + codex doesn't have built in. Set False to opt out. + """ + report = MigrationReport(dry_run=dry_run) + codex_home = codex_home or Path.home() / ".codex" + target = codex_home / "config.toml" + report.target_path = target + + hermes_servers = (hermes_config or {}).get("mcp_servers") or {} + if not isinstance(hermes_servers, dict): + report.errors.append( + "mcp_servers in Hermes config is not a dict; cannot migrate." + ) + return report + + translated: dict[str, dict] = {} + for name, cfg in hermes_servers.items(): + out, skipped = _translate_one_server(str(name), cfg or {}) + if out is None: + report.errors.append( + f"server {name!r} skipped: {', '.join(skipped) or 'no transport configured'}" + ) + continue + translated[str(name)] = out + if skipped: + report.skipped_keys_per_server[str(name)] = skipped + report.migrated.append(str(name)) + + # Discover installed Codex curated plugins. Best-effort — never blocks + # the migration if codex is unreachable or the RPC fails. 
+ plugins: list[dict] = [] + if discover_plugins and not dry_run: + plugins, plugin_err = _query_codex_plugins(codex_home=codex_home) + if plugin_err: + report.plugin_query_error = plugin_err + for p in plugins: + report.migrated_plugins.append(f"{p['name']}@{p['marketplace']}") + + # Track whether we wrote a default permission profile so the report + # surfaces it to the user. + if default_permission_profile: + report.wrote_permissions_default = default_permission_profile + + # Inject Hermes' own tool surface as an MCP server so the spawned + # codex subprocess can call back into Hermes for the tools codex + # doesn't ship with — web_search, browser_*, delegate_task, vision, + # memory, skills, session_search, image_generate, text_to_speech. + # The server itself is agent/transports/hermes_tools_mcp_server.py + # and is launched on demand by codex (stdio MCP). + if expose_hermes_tools: + translated["hermes-tools"] = _build_hermes_tools_mcp_entry() + if "hermes-tools" not in report.migrated: + report.migrated.append("hermes-tools") + + # Build the new managed block + managed_block = render_codex_toml_section( + translated, plugins=plugins, + default_permission_profile=default_permission_profile, + ) + + # Read existing codex config if any, strip the prior managed block, + # append the new one. 
+ if target.exists(): + try: + existing = target.read_text(encoding="utf-8") + except Exception as exc: + report.errors.append(f"could not read {target}: {exc}") + return report + without_managed = _strip_existing_managed_block(existing) + # Ensure exactly one blank line between user content and managed block + if without_managed and not without_managed.endswith("\n"): + without_managed += "\n" + new_text = ( + without_managed.rstrip("\n") + "\n\n" + managed_block + if without_managed.strip() + else managed_block + ) + else: + new_text = managed_block + + if dry_run: + return report + + try: + codex_home.mkdir(parents=True, exist_ok=True) + # Atomic write: write to a temp file in the same directory then + # rename. Same-directory rename is atomic on POSIX and ReplaceFile + # on Windows. Avoids leaving a half-written config.toml that + # codex would refuse to load if we crash mid-write. + import tempfile + tmp_fd, tmp_path_str = tempfile.mkstemp( + prefix=".config.toml.", dir=str(codex_home) + ) + tmp_path = Path(tmp_path_str) + try: + with os.fdopen(tmp_fd, "w", encoding="utf-8") as fh: + fh.write(new_text) + tmp_path.replace(target) + except Exception: + # Clean up the temp file if the rename didn't happen. + try: + if tmp_path.exists(): + tmp_path.unlink() + except Exception: + pass + raise + report.written = True + except Exception as exc: + report.errors.append(f"could not write {target}: {exc}") + return report diff --git a/hermes_cli/codex_runtime_switch.py b/hermes_cli/codex_runtime_switch.py new file mode 100644 index 00000000000..b3adda12b54 --- /dev/null +++ b/hermes_cli/codex_runtime_switch.py @@ -0,0 +1,266 @@ +"""Shared logic for the /codex-runtime slash command. + +Toggles `model.openai_runtime` between "auto" (= chat_completions, Hermes' +default) and "codex_app_server" (= hand turns to a codex subprocess). + +Both CLI (cli.py) and gateway (gateway/run.py) call into this module so the +behavior stays identical across surfaces. 
def parse_args(arg_string: str) -> tuple[Optional[str], list[str]]:
    """Interpret the text following the slash command.

    The input is trimmed and lower-cased before matching.

    Returns:
        ``(value, errors)`` where

        * empty / missing input gives ``(None, [])``, meaning "just show
          the current state";
        * ``on`` / ``codex`` / ``enable`` give ``("codex_app_server", [])``;
        * ``off`` / ``default`` / ``disable`` / ``hermes`` give
          ``("auto", [])``;
        * a literal member of VALID_RUNTIMES is returned as-is;
        * anything else gives ``(None, [message])``.
    """
    token = (arg_string or "").strip().lower()
    if not token:
        return None, []

    # Human-friendly synonyms mapped onto the canonical runtime names.
    synonyms = {
        "on": "codex_app_server",
        "codex": "codex_app_server",
        "enable": "codex_app_server",
        "off": "auto",
        "default": "auto",
        "disable": "auto",
        "hermes": "auto",
    }
    resolved = synonyms.get(token)
    if resolved is None and token in VALID_RUNTIMES:
        resolved = token
    if resolved is not None:
        return resolved, []

    return None, [
        f"Unknown runtime {token!r}. Use one of: auto, codex_app_server, on, off"
    ]
def _codex_migration_summary(config: dict) -> list[str]:
    """Run the Codex config migration and return user-facing summary lines.

    Best-effort: any exception (including a failed import of the migration
    module) is converted into a single warning line appended after whatever
    lines were already produced, so the caller's runtime switch still
    succeeds.
    """
    lines: list[str] = []
    try:
        from hermes_cli.codex_runtime_plugin_migration import migrate

        mig_report = migrate(config)
        # Tools/MCP servers (excluding the hermes-tools callback, which is
        # internal plumbing and is reported separately below).
        user_servers = [s for s in mig_report.migrated if s != "hermes-tools"]
        if user_servers:
            lines.append(
                f"Migrated {len(user_servers)} MCP server(s): "
                f"{', '.join(user_servers)}"
            )
        # Native Codex plugin migration.
        if mig_report.migrated_plugins:
            lines.append(
                f"Migrated {len(mig_report.migrated_plugins)} native "
                f"Codex plugin(s): {', '.join(mig_report.migrated_plugins)}"
            )
        elif mig_report.plugin_query_error:
            lines.append(
                f"Codex plugin discovery skipped: "
                f"{mig_report.plugin_query_error}"
            )
        if mig_report.wrote_permissions_default:
            lines.append(
                f"Default sandbox: {mig_report.wrote_permissions_default} "
                f"(no approval prompt on every write)"
            )
        if "hermes-tools" in mig_report.migrated:
            lines.append(
                "Hermes tool callback registered: codex can now use "
                "web_search, web_extract, browser_*, vision_analyze, "
                "image_generate, skill_view, skills_list, text_to_speech, "
                "kanban_* (worker + orchestrator) via MCP."
            )
            lines.append(
                " (delegate_task, memory, session_search, todo run "
                "only on the default Hermes runtime — they need the "
                "agent loop context.)"
            )
        lines.append(f" (config: {mig_report.target_path})")
        for err in mig_report.errors:
            lines.append(f"⚠ MCP migration: {err}")
    except Exception as exc:
        lines.append(f"⚠ MCP migration skipped: {exc}")
    return lines


def apply(
    config: dict,
    new_value: Optional[str],
    *,
    persist_callback=None,
) -> CodexRuntimeStatus:
    """Top-level entry point used by both CLI and gateway handlers.

    Args:
        config: in-memory config dict (will be mutated when new_value is set)
        new_value: desired runtime; None means "show current state only"
        persist_callback: optional callable taking the mutated config dict
            and persisting it to disk. Skipped when None (used by tests).

    Returns:
        CodexRuntimeStatus describing the outcome. ``success`` is False when
        enabling is blocked because the codex binary check failed, or when
        ``persist_callback`` raised (in that case ``config`` has already
        been mutated in memory).

    Raises:
        ValueError: propagated from set_runtime() when ``new_value`` is
            neither None nor a member of VALID_RUNTIMES.
    """
    current = get_current_runtime(config)

    # Cache the codex binary check for this apply() call so the enable path
    # (gate + success message) does not spawn the subprocess repeatedly.
    # None = not yet checked; tuple = result.
    _binary_check: Optional[tuple[bool, Optional[str]]] = None

    def _check_binary_cached() -> tuple[bool, Optional[str]]:
        nonlocal _binary_check
        if _binary_check is None:
            _binary_check = check_codex_binary_ok()
        return _binary_check

    # Read-only call: just report state.
    if new_value is None:
        ok, ver = _check_binary_cached()
        if ok:
            # ``ver`` is typed Optional; guard so a (True, None) result
            # cannot raise TypeError while merely displaying status.
            cli_state = "OK " + (ver or "")
        else:
            cli_state = "not available — " + (
                ver or "install with `npm i -g @openai/codex`"
            )
        return CodexRuntimeStatus(
            success=True,
            new_value=current,
            old_value=current,
            message=f"openai_runtime: {current}\ncodex CLI: {cli_state}",
            codex_binary_ok=ok,
            codex_version=ver if ok else None,
        )

    # No change requested.
    if new_value == current:
        return CodexRuntimeStatus(
            success=True,
            new_value=current,
            old_value=current,
            message=f"openai_runtime already set to {current}",
        )

    # If switching ON, verify the codex CLI before persisting: an opt-in
    # toggle that silently fails on the first turn is poor UX, so block
    # here with a clear install hint instead.
    if new_value == "codex_app_server":
        ok, ver_or_msg = _check_binary_cached()
        if not ok:
            return CodexRuntimeStatus(
                success=False,
                new_value=None,
                old_value=current,
                message=(
                    "Cannot enable codex_app_server runtime: "
                    f"{ver_or_msg or 'codex CLI not available'}\n"
                    "Install with: npm i -g @openai/codex"
                ),
                codex_binary_ok=False,
                codex_version=None,
            )

    set_runtime(config, new_value)
    if persist_callback is not None:
        try:
            persist_callback(config)
        except Exception as exc:
            logger.exception("failed to persist openai_runtime change")
            return CodexRuntimeStatus(
                success=False,
                new_value=new_value,
                old_value=current,
                message=f"updated config in memory but persist failed: {exc}",
            )

    msg_lines = [
        f"openai_runtime: {current} → {new_value}",
    ]
    if new_value == "codex_app_server":
        ok, ver = _check_binary_cached()
        if ok:
            msg_lines.append(f"codex CLI: {ver}")
        # Auto-migrate Hermes' MCP servers and Codex's installed plugins
        # into the codex config. Failures are non-fatal; the runtime change
        # still proceeds.
        msg_lines.extend(_codex_migration_summary(config))
        msg_lines.append(
            "OpenAI/Codex turns now run through `codex app-server` "
            "(terminal/file ops/patching inside Codex; "
            "Hermes tools available via MCP callback)."
        )
        msg_lines.append(
            "Effective on next session — current cached agent keeps "
            "the prior runtime to preserve prompt cache."
        )
    else:
        msg_lines.append("OpenAI/Codex turns will use the default Hermes runtime.")
        msg_lines.append("Effective on next session.")
    return CodexRuntimeStatus(
        success=True,
        new_value=new_value,
        old_value=current,
        message="\n".join(msg_lines),
        requires_new_session=True,
    )
+ "codex_app_server", +} def _parse_api_mode(raw: Any) -> Optional[str]: @@ -176,6 +187,32 @@ def _parse_api_mode(raw: Any) -> Optional[str]: return None +def _maybe_apply_codex_app_server_runtime( + *, + provider: str, + api_mode: str, + model_cfg: Optional[Dict[str, Any]], +) -> str: + """Optional opt-in: rewrite api_mode → "codex_app_server" for OpenAI/Codex + providers when the user has explicitly enabled that runtime via + `model.openai_runtime: codex_app_server` in config.yaml. + + Default behavior is preserved: when the key is unset, "auto", or empty, + this function is a no-op. Only providers in {"openai", "openai-codex"} + are eligible — other providers (anthropic, openrouter, etc.) cannot be + rerouted through codex. + + Returns the (possibly-rewritten) api_mode.""" + if not model_cfg: + return api_mode + if provider not in ("openai", "openai-codex"): + return api_mode + runtime = str(model_cfg.get("openai_runtime") or "").strip().lower() + if runtime == "codex_app_server": + return "codex_app_server" + return api_mode + + def _resolve_runtime_from_pool_entry( *, provider: str, @@ -293,6 +330,12 @@ def _resolve_runtime_from_pool_entry( if api_mode == "anthropic_messages" and provider in {"opencode-zen", "opencode-go"}: base_url = re.sub(r"/v1/?$", "", base_url) + # Optional opt-in: route OpenAI/Codex turns through `codex app-server`. + # Inert when `model.openai_runtime` is unset or "auto". 
+ api_mode = _maybe_apply_codex_app_server_runtime( + provider=provider, api_mode=api_mode, model_cfg=model_cfg + ) + return { "provider": provider, "api_mode": api_mode, diff --git a/run_agent.py b/run_agent.py index f2f3379e0d7..f9eaee85af6 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1271,7 +1271,7 @@ class AIAgent: self.provider = provider_name or "" self.acp_command = acp_command or command self.acp_args = list(acp_args or args or []) - if api_mode in {"chat_completions", "codex_responses", "anthropic_messages", "bedrock_converse"}: + if api_mode in {"chat_completions", "codex_responses", "anthropic_messages", "bedrock_converse", "codex_app_server"}: self.api_mode = api_mode elif self.provider == "openai-codex": self.api_mode = "codex_responses" @@ -4267,13 +4267,24 @@ class AIAgent: # reconstruct auth from scratch -- producing the spurious # "No LLM provider configured" warning at end of turn. _parent_runtime = self._current_main_runtime() + _parent_api_mode = _parent_runtime.get("api_mode") or None + # The review fork needs to call agent-loop tools (memory, + # skill_manage). Those tools require Hermes' own dispatch, + # which the codex_app_server runtime bypasses entirely + # (it runs the turn inside codex's subprocess). So when + # the parent is on codex_app_server, downgrade the review + # fork to codex_responses — same auth/credentials, but + # talks to the OpenAI Responses API directly so Hermes + # owns the loop and the agent-loop tools dispatch. 
    def _run_codex_app_server_turn(
        self,
        *,
        user_message: str,
        original_user_message: Any,
        messages: List[Dict[str, Any]],
        effective_task_id: str,
        should_review_memory: bool = False,
    ) -> Dict[str, Any]:
        """Codex app-server runtime path. Hands the entire turn to a `codex
        app-server` subprocess and projects its events back into Hermes'
        messages list so memory/skill review keep working.

        Called from run_conversation() when self.api_mode == "codex_app_server".
        Returns the same dict shape as the chat_completions path.

        Args:
            user_message: text passed to the session as ``user_input``.
            original_user_message: forwarded unchanged to
                ``_sync_external_memory_for_turn``.
            messages: conversation list; extended IN PLACE with the turn's
                projected messages and returned in the result dict.
            effective_task_id: accepted for signature parity; not referenced
                anywhere in this method body.
            should_review_memory: when True, a background review may be
                spawned after a successful, non-interrupted turn.

        Returns:
            On success or partial completion: a dict with ``final_response``,
            ``messages``, ``api_calls`` (always 1), ``completed``,
            ``partial``, ``error``, ``codex_thread_id`` and
            ``codex_turn_id``. If ``run_turn`` raises: a dict with a
            human-readable ``final_response``, ``api_calls`` 0,
            ``completed`` False, ``partial`` True and ``error`` set to the
            exception text (no ``codex_*`` keys).
        """
        from agent.transports.codex_app_server_session import CodexAppServerSession

        # Lazy session: one CodexAppServerSession per AIAgent instance.
        # Spawned on first turn, reused across turns, closed at AIAgent
        # shutdown (see _cleanup hook).
        if not hasattr(self, "_codex_session") or self._codex_session is None:
            cwd = getattr(self, "session_cwd", None) or os.getcwd()
            # Approval callback: defer to Hermes' standard prompt flow if a
            # CLI thread has installed one. Gateway / cron contexts get the
            # codex-side fail-closed default.
            try:
                from tools.terminal_tool import _get_approval_callback
                approval_callback = _get_approval_callback()
            except Exception:
                # Best-effort: any failure to obtain the callback simply
                # leaves it unset.
                approval_callback = None
            self._codex_session = CodexAppServerSession(
                cwd=cwd,
                approval_callback=approval_callback,
            )

        # NOTE: the user message is ALREADY appended to messages by the
        # standard run_conversation() flow (line ~11823) before the early
        # return reaches us. Do NOT append again — that would duplicate.

        try:
            turn = self._codex_session.run_turn(user_input=user_message)
        except Exception as exc:
            # NOTE(review): the session object is kept after a failure here;
            # confirm CodexAppServerSession can recover on the next turn,
            # otherwise it may need to be reset to None.
            logger.exception("codex app-server turn failed")
            return {
                "final_response": (
                    f"Codex app-server turn failed: {exc}. "
                    f"Fall back to default runtime with `/codex-runtime auto`."
                ),
                "messages": messages,
                "api_calls": 0,
                "completed": False,
                "partial": True,
                "error": str(exc),
            }

        # Splice projected messages into the conversation. The projector emits
        # standard {role, content, tool_calls, tool_call_id} entries, which
        # is exactly what curator.py / sessions DB expect.
        if turn.projected_messages:
            messages.extend(turn.projected_messages)

        # Counter ticks for the self-improvement loop.
        # _turns_since_memory and _user_turn_count are ALREADY incremented
        # in the run_conversation() pre-loop block (lines ~11793-11817) so we
        # do NOT touch them here — that would double-count.
        # Only _iters_since_skill needs explicit increment, since the
        # chat_completions loop bumps it per tool iteration (line ~12110)
        # and that loop is bypassed on this path.
        self._iters_since_skill = (
            getattr(self, "_iters_since_skill", 0) + turn.tool_iterations
        )

        # Now check the skill nudge AFTER iters were incremented — same
        # pattern the chat_completions path uses (line ~15432).
        should_review_skills = False
        if (
            self._skill_nudge_interval > 0
            and self._iters_since_skill >= self._skill_nudge_interval
            and "skill_manage" in self.valid_tool_names
        ):
            should_review_skills = True
            # Reset so the next nudge needs a full interval again.
            self._iters_since_skill = 0

        # External memory provider sync (mirrors line ~15439). Skipped on
        # interrupt/error to avoid feeding partial transcripts to memory.
        if not turn.interrupted and turn.error is None:
            try:
                self._sync_external_memory_for_turn(
                    original_user_message=original_user_message,
                    final_response=turn.final_text,
                    interrupted=False,
                )
            except Exception:
                # Memory sync is best-effort; never fail the turn over it.
                logger.debug("external memory sync raised", exc_info=True)

        # Background review fork — same cadence + signature as the default
        # path (line ~15449). Only fires when a trigger actually tripped AND
        # we have a real final response.
        if (
            turn.final_text
            and not turn.interrupted
            and (should_review_memory or should_review_skills)
        ):
            try:
                self._spawn_background_review(
                    # Copy so the review sees a stable snapshot even if the
                    # caller keeps mutating ``messages``.
                    messages_snapshot=list(messages),
                    review_memory=should_review_memory,
                    review_skills=should_review_skills,
                )
            except Exception:
                logger.debug("background review spawn raised", exc_info=True)

        return {
            "final_response": turn.final_text,
            "messages": messages,
            "api_calls": 1,  # one app-server "turn" maps to one logical API call
            "completed": not turn.interrupted and turn.error is None,
            "partial": turn.interrupted or turn.error is not None,
            "error": turn.error,
            "codex_thread_id": turn.thread_id,
            "codex_turn_id": turn.turn_id,
        }
class TestMaybeApplyCodexAppServerRuntime:
    """The opt-in helper that rewrites api_mode → codex_app_server.

    Covers three properties of _maybe_apply_codex_app_server_runtime: it is
    a no-op unless the ``openai_runtime`` key requests the codex runtime,
    the key's value is matched case-insensitively, and only the ``openai``
    and ``openai-codex`` providers are ever rewritten.
    """

    @pytest.mark.parametrize(
        "model_cfg",
        [
            None,
            {},
            {"openai_runtime": ""},
            {"openai_runtime": "auto"},
            {"openai_runtime": "AUTO"},
            {"other_key": "codex_app_server"},  # wrong key
        ],
    )
    def test_default_off_for_openai(self, model_cfg) -> None:
        """Default behavior is preserved when the flag is unset/auto."""
        got = _maybe_apply_codex_app_server_runtime(
            provider="openai", api_mode="chat_completions", model_cfg=model_cfg
        )
        assert got == "chat_completions"

    def test_opt_in_rewrites_openai(self) -> None:
        # Eligible provider + explicit opt-in: api_mode is rewritten.
        got = _maybe_apply_codex_app_server_runtime(
            provider="openai",
            api_mode="chat_completions",
            model_cfg={"openai_runtime": "codex_app_server"},
        )
        assert got == "codex_app_server"

    def test_opt_in_rewrites_openai_codex(self) -> None:
        # The second eligible provider is rewritten regardless of the
        # api_mode it came in with.
        got = _maybe_apply_codex_app_server_runtime(
            provider="openai-codex",
            api_mode="codex_responses",
            model_cfg={"openai_runtime": "codex_app_server"},
        )
        assert got == "codex_app_server"

    def test_case_insensitive(self) -> None:
        # The config value is lower-cased before comparison.
        got = _maybe_apply_codex_app_server_runtime(
            provider="openai",
            api_mode="chat_completions",
            model_cfg={"openai_runtime": "Codex_App_Server"},
        )
        assert got == "codex_app_server"

    @pytest.mark.parametrize(
        "provider",
        [
            "anthropic",
            "openrouter",
            "xai",
            "qwen-oauth",
            "google-gemini-cli",
            "opencode-zen",
            "bedrock",
            "",
        ],
    )
    def test_other_providers_never_rerouted(self, provider) -> None:
        """Non-OpenAI providers MUST NOT be rerouted even with the flag set —
        codex's app-server can only run OpenAI/Codex auth flows."""
        got = _maybe_apply_codex_app_server_runtime(
            provider=provider,
            api_mode="anthropic_messages",
            model_cfg={"openai_runtime": "codex_app_server"},
        )
        assert got == "anthropic_messages", (
            f"provider={provider!r} should not be rerouted to codex_app_server"
        )
+ + OpenClaw hit this footgun (openclaw/openclaw#81562) — they were + rewriting HOME to a synthetic per-agent dir alongside CODEX_HOME, + and then `gh auth status` / git config / etc. all broke inside codex + shell calls. We avoid the same bug by only overlaying CODEX_HOME and + RUST_LOG on top of os.environ.copy(). + """ + + def test_spawn_env_preserves_HOME(self, monkeypatch): + """The spawn env must contain the parent process's HOME unchanged. + Verifies via a subprocess-monkey-patch.""" + import subprocess + from agent.transports import codex_app_server as cas + + captured = {} + + class FakePopen: + def __init__(self, cmd, *args, **kwargs): + captured["env"] = kwargs.get("env", {}).copy() + # Provide minimal Popen surface so __init__ doesn't crash + # on attribute access during construction. + self.stdin = None + self.stdout = None + self.stderr = None + self.pid = 1 + self.returncode = None + + def poll(self): + return None + + def terminate(self): + pass + + def wait(self, timeout=None): + return 0 + + def kill(self): + pass + + monkeypatch.setattr(subprocess, "Popen", FakePopen) + monkeypatch.setenv("HOME", "/users/alice") + + client = cas.CodexAppServerClient(codex_bin="codex") + client._closed = True # so close() is a no-op + + # The spawn env must have HOME=/users/alice unchanged + assert captured["env"].get("HOME") == "/users/alice", ( + f"HOME got rewritten in codex spawn env: " + f"{captured['env'].get('HOME')!r}. Codex's shell tool's " + "subprocesses (gh, git, aws, npm) need the user's real HOME." 
+ ) + + def test_spawn_env_sets_CODEX_HOME_when_provided(self, monkeypatch): + """CODEX_HOME isolation must still work — that's the whole point + of the codex_home arg.""" + import subprocess + from agent.transports import codex_app_server as cas + + captured = {} + + class FakePopen: + def __init__(self, cmd, *args, **kwargs): + captured["env"] = kwargs.get("env", {}).copy() + self.stdin = None + self.stdout = None + self.stderr = None + self.pid = 1 + self.returncode = None + + def poll(self): + return None + + def terminate(self): + pass + + def wait(self, timeout=None): + return 0 + + def kill(self): + pass + + monkeypatch.setattr(subprocess, "Popen", FakePopen) + monkeypatch.setenv("HOME", "/users/alice") + + client = cas.CodexAppServerClient( + codex_bin="codex", codex_home="/tmp/profile/codex" + ) + client._closed = True + + assert captured["env"].get("CODEX_HOME") == "/tmp/profile/codex" + # And HOME still passes through unchanged + assert captured["env"].get("HOME") == "/users/alice" diff --git a/tests/agent/transports/test_codex_app_server_session.py b/tests/agent/transports/test_codex_app_server_session.py new file mode 100644 index 00000000000..de0b2f60cb8 --- /dev/null +++ b/tests/agent/transports/test_codex_app_server_session.py @@ -0,0 +1,502 @@ +"""Tests for CodexAppServerSession — drive turns through a mock client. + +The session adapter has the most complex behavior of the three new modules: +notification draining, server-request handling (approvals), interrupt, +deadline timeouts. These tests pin all of that without spawning real codex. 
+""" + +from __future__ import annotations + +import threading +import time +from typing import Any, Optional + +import pytest + +from agent.transports.codex_app_server_session import ( + CodexAppServerSession, + TurnResult, + _ServerRequestRouting, + _approval_choice_to_codex_decision, +) + + +class FakeClient: + """Stand-in for CodexAppServerClient that records calls and lets the test + drive the notification / server-request streams synchronously.""" + + def __init__(self, *, codex_bin: str = "codex", codex_home=None) -> None: + self.codex_bin = codex_bin + self.codex_home = codex_home + self.requests: list[tuple[str, dict]] = [] + self.notifications_responses: list[dict] = [] + self.responses: list[tuple[Any, dict]] = [] + self.error_responses: list[tuple[Any, int, str]] = [] + self._initialized = False + self._closed = False + self._notifications: list[dict] = [] + self._server_requests: list[dict] = [] + self._request_handler = None # Optional[Callable[[str, dict], dict]] + + # API matching CodexAppServerClient + def initialize(self, **kwargs): + self._initialized = True + return {"userAgent": "fake/0.0.0", "codexHome": "/tmp", + "platformOs": "linux", "platformFamily": "unix"} + + def request(self, method: str, params: Optional[dict] = None, timeout: float = 30.0): + self.requests.append((method, params or {})) + if self._request_handler is not None: + return self._request_handler(method, params or {}) + # Sensible defaults for protocol methods used by the session + if method == "thread/start": + return {"thread": {"id": "thread-fake-001"}, + "activePermissionProfile": {"id": "workspace-write"}} + if method == "turn/start": + return {"turn": {"id": "turn-fake-001"}} + if method == "turn/interrupt": + return {} + return {} + + def notify(self, method: str, params=None): + pass + + def respond(self, request_id, result): + self.responses.append((request_id, result)) + + def respond_error(self, request_id, code, message, data=None): + 
self.error_responses.append((request_id, code, message)) + + def take_notification(self, timeout: float = 0.0): + if self._notifications: + return self._notifications.pop(0) + # Honor a tiny sleep so the loop doesn't hot-spin; the real client + # blocks on a queue. For tests we want determinism. + if timeout > 0: + time.sleep(min(timeout, 0.001)) + return None + + def take_server_request(self, timeout: float = 0.0): + if self._server_requests: + return self._server_requests.pop(0) + return None + + def close(self): + self._closed = True + + # Test helpers + def queue_notification(self, method: str, **params): + self._notifications.append({"method": method, "params": params}) + + def queue_server_request(self, method: str, request_id: Any = "srv-1", **params): + self._server_requests.append({"id": request_id, "method": method, "params": params}) + + +def make_session(client: FakeClient, **kwargs) -> CodexAppServerSession: + return CodexAppServerSession( + cwd="/tmp", + client_factory=lambda **kw: client, + **kwargs, + ) + + +# ---- choice mapping ---- + +class TestApprovalChoiceMapping: + @pytest.mark.parametrize("choice,expected", [ + ("once", "accept"), + ("session", "acceptForSession"), + ("always", "acceptForSession"), + ("deny", "decline"), + ("anything-else", "decline"), + ]) + def test_mapping(self, choice, expected): + assert _approval_choice_to_codex_decision(choice) == expected + + +# ---- lifecycle ---- + +class TestLifecycle: + def test_ensure_started_is_idempotent(self): + client = FakeClient() + s = make_session(client) + tid_a = s.ensure_started() + tid_b = s.ensure_started() + assert tid_a == tid_b == "thread-fake-001" + # thread/start should be called exactly once + method_calls = [m for (m, _) in client.requests if m == "thread/start"] + assert len(method_calls) == 1 + + def test_thread_start_passes_cwd_only(self): + """thread/start carries cwd. 
We intentionally do NOT pass `permissions` + on this codex version (experimentalApi-gated + requires matching + config.toml [permissions] table). Letting codex use its default + (read-only unless user configures otherwise) is the documented path.""" + client = FakeClient() + s = make_session(client, permission_profile="workspace-write") + s.ensure_started() + method, params = next(r for r in client.requests if r[0] == "thread/start") + assert params["cwd"] == "/tmp" + assert "permissions" not in params # see session.ensure_started() comment + + def test_close_idempotent(self): + client = FakeClient() + s = make_session(client) + s.ensure_started() + s.close() + s.close() + assert client._closed is True + + +# ---- turn loop ---- + +class TestRunTurn: + def test_simple_text_turn_returns_final_message(self): + client = FakeClient() + client.queue_notification("turn/started", threadId="t", turn={"id": "tu1"}) + client.queue_notification( + "item/completed", + item={"type": "agentMessage", "id": "m1", "text": "hello world"}, + threadId="t", turnId="tu1", + ) + client.queue_notification( + "turn/completed", + threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) + r = s.run_turn("hi", turn_timeout=2.0) + assert r.final_text == "hello world" + assert r.interrupted is False + assert r.error is None + assert any(m["role"] == "assistant" and m.get("content") == "hello world" + for m in r.projected_messages) + # turn_id propagated for downstream session-DB linkage + assert r.turn_id == "turn-fake-001" + + def test_tool_iteration_counter_ticks(self): + client = FakeClient() + # Two completed exec items + one final agent message + for i, item_id in enumerate(("ex1", "ex2"), start=1): + client.queue_notification( + "item/completed", + item={ + "type": "commandExecution", "id": item_id, + "command": f"cmd{i}", "cwd": "/tmp", + "status": "completed", "aggregatedOutput": "ok", + "exitCode": 0, "commandActions": [], + }, + 
threadId="t", turnId="tu1", + ) + client.queue_notification( + "item/completed", + item={"type": "agentMessage", "id": "m1", "text": "done"}, + threadId="t", turnId="tu1", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) + r = s.run_turn("do stuff", turn_timeout=2.0) + assert r.tool_iterations == 2 + # Each tool item produces (assistant, tool) — 2*2 + final assistant = 5 msgs + assert len(r.projected_messages) == 5 + + def test_turn_start_failure_returns_error(self): + client = FakeClient() + from agent.transports.codex_app_server import CodexAppServerError + + def boom(method, params): + if method == "turn/start": + raise CodexAppServerError(code=-32600, message="bad input") + return {"thread": {"id": "t"}, "activePermissionProfile": {"id": "x"}} + + client._request_handler = boom + s = make_session(client) + r = s.run_turn("hi", turn_timeout=2.0) + assert r.error is not None + assert "bad input" in r.error + assert r.final_text == "" + + def test_interrupt_during_turn_issues_turn_interrupt(self): + client = FakeClient() + # Don't queue turn/completed — the loop has to interrupt out + client.queue_notification( + "item/completed", + item={"type": "commandExecution", "id": "x", "command": "sleep 60", + "cwd": "/", "status": "inProgress", + "aggregatedOutput": None, "exitCode": None, + "commandActions": []}, + threadId="t", turnId="tu1", + ) + s = make_session(client) + s.ensure_started() + # Trip the interrupt before run_turn even consumes the notification. + # The loop will see interrupt set on its first iteration and bail. 
+ s.request_interrupt() + r = s.run_turn("loop forever", turn_timeout=2.0) + assert r.interrupted is True + # turn/interrupt was requested with the right turnId + assert any( + method == "turn/interrupt" and params.get("turnId") == "turn-fake-001" + for (method, params) in client.requests + ) + + def test_deadline_exceeded_records_error(self): + client = FakeClient() + # No notifications and no completion → must hit deadline + s = make_session(client) + r = s.run_turn("never finishes", turn_timeout=0.05, + notification_poll_timeout=0.01) + assert r.interrupted is True + assert r.error and "timed out" in r.error + + def test_failed_turn_records_error_from_turn_completed(self): + client = FakeClient() + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "failed", + "error": {"message": "model error"}}, + ) + s = make_session(client) + r = s.run_turn("x", turn_timeout=1.0) + assert r.error and "model error" in r.error + + +# ---- approval bridge ---- + +class TestServerRequestRouting: + def test_exec_approval_with_callback_approves_once(self): + client = FakeClient() + client.queue_server_request( + "item/commandExecution/requestApproval", request_id="req-1", + command="ls /tmp", cwd="/tmp", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + + captured: dict = {} + + def cb(command, description, *, allow_permanent=True): + captured["command"] = command + captured["description"] = description + return "once" + + s = make_session(client, approval_callback=cb) + s.run_turn("hi", turn_timeout=1.0) + assert captured["command"] == "ls /tmp" + # The session must have responded to the server request with "accept" + assert ("req-1", {"decision": "accept"}) in client.responses + + def test_exec_approval_no_callback_denies(self): + client = FakeClient() + client.queue_server_request("item/commandExecution/requestApproval", request_id="req-1", + command="rm 
-rf /", cwd="/") + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) # no approval_callback wired + s.run_turn("hi", turn_timeout=1.0) + assert ("req-1", {"decision": "decline"}) in client.responses + + def test_apply_patch_approval_session_maps_to_session_decision(self): + client = FakeClient() + client.queue_server_request( + "item/fileChange/requestApproval", request_id="req-2", + itemId="fc-1", + turnId="t1", + threadId="th", + startedAtMs=1234567890, + reason="create new file with hello() function", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + + def cb(command, description, *, allow_permanent=True): + return "session" + + s = make_session(client, approval_callback=cb) + s.run_turn("hi", turn_timeout=1.0) + assert ("req-2", {"decision": "acceptForSession"}) in client.responses + + def test_unknown_server_request_replied_with_error(self): + client = FakeClient() + client.queue_server_request("totally/unknown", request_id="req-3") + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) + s.run_turn("hi", turn_timeout=1.0) + assert any( + rid == "req-3" and code == -32601 + for (rid, code, _msg) in client.error_responses + ) + + def test_mcp_elicitation_for_hermes_tools_auto_accepts(self): + """When codex elicits on behalf of hermes-tools (our own callback), + accept automatically — the user already opted in by enabling the + runtime.""" + client = FakeClient() + client.queue_server_request( + "mcpServer/elicitation/request", request_id="elic-1", + threadId="t", turnId="tu1", + serverName="hermes-tools", + mode="form", + message="confirm", + requestedSchema={"type": "object", "properties": {}}, + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", 
"status": "completed", "error": None}, + ) + s = make_session(client) + s.run_turn("hi", turn_timeout=1.0) + assert ("elic-1", {"action": "accept", "content": None, "_meta": None}) in client.responses + + def test_mcp_elicitation_for_other_servers_declines(self): + """For third-party MCP servers we decline by default so users + explicitly opt in through codex's own UI.""" + client = FakeClient() + client.queue_server_request( + "mcpServer/elicitation/request", request_id="elic-2", + threadId="t", turnId="tu1", + serverName="some-third-party", + mode="url", + message="please log in", + url="https://example.com/oauth", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + s = make_session(client) + s.run_turn("hi", turn_timeout=1.0) + assert ("elic-2", {"action": "decline", "content": None, "_meta": None}) in client.responses + + def test_routing_auto_approve_bypass(self): + client = FakeClient() + client.queue_server_request("item/commandExecution/requestApproval", request_id="r1", + command="ls", cwd="/") + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + # No callback, but routing says auto-approve. Should approve. 
+ s = make_session(client, request_routing=_ServerRequestRouting( + auto_approve_exec=True)) + s.run_turn("hi", turn_timeout=1.0) + assert ("r1", {"decision": "accept"}) in client.responses + + def test_callback_raises_falls_back_to_decline(self): + client = FakeClient() + client.queue_server_request("item/commandExecution/requestApproval", request_id="r1", + command="ls", cwd="/") + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + + def boom(*a, **kw): + raise RuntimeError("ui crashed") + + s = make_session(client, approval_callback=boom) + s.run_turn("hi", turn_timeout=1.0) + # Fail-closed: deny on callback exception + assert ("r1", {"decision": "decline"}) in client.responses + + +# ---- enriched approval prompts ---- + +class TestApprovalPromptEnrichment: + """Quirk #4: apply_patch prompt should show what's changing. + Quirk #10: exec prompt should never show empty cwd.""" + + def test_exec_falls_back_to_session_cwd(self): + """When codex omits cwd from the approval params, the prompt shows + the session cwd, not an empty string.""" + client = FakeClient() + client.queue_server_request( + "item/commandExecution/requestApproval", request_id="r1", + command="ls", # no cwd + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + captured = {} + def cb(command, description, *, allow_permanent=True): + captured["description"] = description + return "once" + s = make_session(client, approval_callback=cb) + s.run_turn("hi", turn_timeout=1.0) + # Session cwd is /tmp by default in make_session() + assert "/tmp" in captured["description"] + assert "Codex requests exec in " not in captured["description"] + + def test_apply_patch_prompt_summarizes_pending_changes(self): + """When the projector has cached the fileChange item from item/started, + the approval prompt surfaces the change summary.""" + client = FakeClient() + 
# item/started fires first (carries the changes), then approval request + client.queue_notification( + "item/started", + item={"type": "fileChange", "id": "fc-1", + "changes": [ + {"kind": {"type": "add"}, "path": "/tmp/new.py"}, + {"kind": {"type": "update"}, "path": "/tmp/old.py"}, + ]}, + threadId="t", turnId="tu1", + ) + client.queue_server_request( + "item/fileChange/requestApproval", request_id="req-2", + itemId="fc-1", turnId="tu1", threadId="t", + startedAtMs=1234567890, + reason="add and update files", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + captured = {} + def cb(command, description, *, allow_permanent=True): + captured["command"] = command + captured["description"] = description + return "once" + s = make_session(client, approval_callback=cb) + s.run_turn("hi", turn_timeout=1.0) + # Both add and update kinds should be in the summary + assert "1 add" in captured["command"] or "1 add" in captured["description"] + assert "1 update" in captured["command"] or "1 update" in captured["description"] + # And at least one of the paths + joined = captured["command"] + " " + captured["description"] + assert "/tmp/new.py" in joined or "/tmp/old.py" in joined + + def test_apply_patch_prompt_works_without_cached_summary(self): + """When approval arrives before item/started (or without changes + info), prompt falls back to whatever codex provided.""" + client = FakeClient() + client.queue_server_request( + "item/fileChange/requestApproval", request_id="req-2", + itemId="fc-orphan", turnId="tu1", threadId="t", + startedAtMs=1234567890, + reason="apply some changes", + ) + client.queue_notification( + "turn/completed", threadId="t", + turn={"id": "tu1", "status": "completed", "error": None}, + ) + captured = {} + def cb(command, description, *, allow_permanent=True): + captured["command"] = command + return "once" + s = make_session(client, approval_callback=cb) + 
s.run_turn("hi", turn_timeout=1.0) + # Falls back to the reason + assert "apply some changes" in captured["command"] diff --git a/tests/agent/transports/test_codex_event_projector.py b/tests/agent/transports/test_codex_event_projector.py new file mode 100644 index 00000000000..04980f35c61 --- /dev/null +++ b/tests/agent/transports/test_codex_event_projector.py @@ -0,0 +1,303 @@ +"""Tests for CodexEventProjector — codex item/* events → Hermes messages list. + +Drives projection against fixture notifications captured from codex 0.130.0 +plus synthetic ones for item types we couldn't auth-test live.""" + +from __future__ import annotations + +import json + +import pytest + +from agent.transports.codex_event_projector import ( + CodexEventProjector, + ProjectionResult, + _deterministic_call_id, + _format_tool_args, +) + + +# --- Fixture: real `commandExecution` notification captured from codex 0.130.0 +COMMAND_EXEC_COMPLETED = { + "method": "item/completed", + "params": { + "item": { + "type": "commandExecution", + "id": "f8a75c66-a89e-4fd7-8bcf-2d58e664fa9e", + "command": "/bin/bash -lc 'echo hello && ls /tmp | head -3'", + "cwd": "/tmp", + "processId": None, + "source": "userShell", + "status": "completed", + "commandActions": [ + {"type": "listFiles", "command": "ls /tmp", "path": "tmp"} + ], + "aggregatedOutput": "hello\naa_lang.json\n", + "exitCode": 0, + "durationMs": 10, + }, + "threadId": "019e1a94-352b-71e1-b214-e5c67c9ec190", + "turnId": "019e1a94-3553-7940-8af3-4ca57142deb7", + "completedAtMs": 1778562381151, + }, +} + + +class TestProjectionInvariants: + """Universal invariants that must hold across all projection paths.""" + + def test_streaming_deltas_dont_materialize(self) -> None: + p = CodexEventProjector() + for delta_method in ( + "item/commandExecution/outputDelta", + "item/agentMessage/delta", + "item/reasoning/delta", + ): + r = p.project({"method": delta_method, "params": {"delta": "x"}}) + assert r.messages == [], ( + f"{delta_method} should NOT 
produce messages — only " + f"item/completed materializes" + ) + assert r.is_tool_iteration is False + assert r.final_text is None + + def test_turn_started_and_completed_are_silent(self) -> None: + p = CodexEventProjector() + for method in ("turn/started", "turn/completed", "thread/started"): + r = p.project({"method": method, "params": {}}) + assert r.messages == [] + + def test_unknown_method_silent(self) -> None: + p = CodexEventProjector() + r = p.project({"method": "totally/unknown", "params": {}}) + assert r.messages == [] + + +class TestCommandExecutionProjection: + """Real captured notification → assistant tool_call + tool result.""" + + def test_command_completed_produces_two_messages(self) -> None: + p = CodexEventProjector() + r = p.project(COMMAND_EXEC_COMPLETED) + assert len(r.messages) == 2 + assert r.is_tool_iteration is True + + def test_first_message_is_assistant_tool_call(self) -> None: + p = CodexEventProjector() + msgs = p.project(COMMAND_EXEC_COMPLETED).messages + assistant = msgs[0] + assert assistant["role"] == "assistant" + assert assistant["content"] is None + assert len(assistant["tool_calls"]) == 1 + tc = assistant["tool_calls"][0] + assert tc["type"] == "function" + assert tc["function"]["name"] == "exec_command" + args = json.loads(tc["function"]["arguments"]) + assert "echo hello" in args["command"] + assert args["cwd"] == "/tmp" + + def test_second_message_is_tool_result_correlating_by_id(self) -> None: + p = CodexEventProjector() + msgs = p.project(COMMAND_EXEC_COMPLETED).messages + assistant, tool = msgs + assert tool["role"] == "tool" + assert tool["tool_call_id"] == assistant["tool_calls"][0]["id"] + assert "hello" in tool["content"] + + def test_nonzero_exit_code_annotated_in_tool_result(self) -> None: + item = {**COMMAND_EXEC_COMPLETED["params"]["item"], "exitCode": 2, + "aggregatedOutput": "boom"} + notif = { + "method": "item/completed", + "params": {**COMMAND_EXEC_COMPLETED["params"], "item": item}, + } + p = 
CodexEventProjector() + msgs = p.project(notif).messages + assert "[exit 2]" in msgs[1]["content"] + assert "boom" in msgs[1]["content"] + + def test_deterministic_call_id_across_replay(self) -> None: + # Same item id → same call_id (prefix cache must stay valid). + p1 = CodexEventProjector() + p2 = CodexEventProjector() + a = p1.project(COMMAND_EXEC_COMPLETED).messages + b = p2.project(COMMAND_EXEC_COMPLETED).messages + assert a[0]["tool_calls"][0]["id"] == b[0]["tool_calls"][0]["id"] + + +class TestAgentMessageProjection: + """assistant text → final_text + assistant message.""" + + def test_agent_message_projects_to_assistant(self) -> None: + p = CodexEventProjector() + r = p.project({ + "method": "item/completed", + "params": {"item": {"type": "agentMessage", "id": "x", + "text": "hi there"}}, + }) + assert r.final_text == "hi there" + assert r.messages == [{"role": "assistant", "content": "hi there"}] + assert r.is_tool_iteration is False + + def test_pending_reasoning_attaches_to_next_assistant_message(self) -> None: + p = CodexEventProjector() + # First a reasoning item lands + r1 = p.project({ + "method": "item/completed", + "params": {"item": {"type": "reasoning", "id": "r1", + "summary": ["thinking..."], + "content": ["step 1", "step 2"]}}, + }) + assert r1.messages == [] # reasoning alone produces no message + # Then the assistant message + r2 = p.project({ + "method": "item/completed", + "params": {"item": {"type": "agentMessage", "id": "a1", + "text": "ok"}}, + }) + assistant = r2.messages[0] + assert "reasoning" in assistant + assert "thinking" in assistant["reasoning"] + assert "step 1" in assistant["reasoning"] + + def test_reasoning_consumed_after_attaching(self) -> None: + p = CodexEventProjector() + p.project({"method": "item/completed", "params": {"item": { + "type": "reasoning", "id": "r1", "summary": ["once"], "content": []}}}) + first = p.project({"method": "item/completed", "params": {"item": { + "type": "agentMessage", "id": "a", "text": 
"first"}}}).messages[0] + second = p.project({"method": "item/completed", "params": {"item": { + "type": "agentMessage", "id": "b", "text": "second"}}}).messages[0] + assert "reasoning" in first + assert "reasoning" not in second + + +class TestFileChangeProjection: + def test_file_change_summary_no_inlined_content(self) -> None: + item = { + "type": "fileChange", + "id": "fc1", + "status": "applied", + "changes": [ + {"kind": {"type": "add"}, "path": "/tmp/new.py"}, + {"kind": {"type": "update"}, "path": "/tmp/old.py"}, + ], + } + p = CodexEventProjector() + msgs = p.project({"method": "item/completed", + "params": {"item": item}}).messages + assert len(msgs) == 2 + tc = msgs[0]["tool_calls"][0] + assert tc["function"]["name"] == "apply_patch" + args = json.loads(tc["function"]["arguments"]) + assert len(args["changes"]) == 2 + assert all("kind" in c and "path" in c for c in args["changes"]) + assert "applied" in msgs[1]["content"] + + +class TestMcpToolCallProjection: + def test_mcp_tool_call_namespaced(self) -> None: + item = { + "type": "mcpToolCall", + "id": "m1", + "server": "obsidian", + "tool": "search_notes", + "status": "completed", + "arguments": {"query": "hermes"}, + "result": {"content": [{"text": "found"}]}, + "error": None, + } + msgs = CodexEventProjector().project( + {"method": "item/completed", "params": {"item": item}} + ).messages + assert msgs[0]["tool_calls"][0]["function"]["name"] == "mcp.obsidian.search_notes" + assert "found" in msgs[1]["content"] + + def test_mcp_error_surfaced(self) -> None: + item = { + "type": "mcpToolCall", "id": "m2", + "server": "x", "tool": "y", "status": "failed", + "arguments": {}, "result": None, + "error": {"code": -1, "message": "no"}, + } + msgs = CodexEventProjector().project( + {"method": "item/completed", "params": {"item": item}} + ).messages + assert "error" in msgs[1]["content"] + + +class TestUserAndOpaqueProjection: + def test_user_message_text_fragments_only(self) -> None: + item = { + "type": 
"userMessage", "id": "u1", + "content": [ + {"type": "text", "text": "hello"}, + {"type": "image", "url": "http://x/y"}, + {"type": "text", "text": "world"}, + ], + } + msgs = CodexEventProjector().project( + {"method": "item/completed", "params": {"item": item}} + ).messages + assert msgs[0]["role"] == "user" + assert "hello" in msgs[0]["content"] + assert "world" in msgs[0]["content"] + + def test_opaque_item_recorded_without_fabricated_tool_calls(self) -> None: + item = {"type": "plan", "id": "p1", "text": "do the thing"} + msgs = CodexEventProjector().project( + {"method": "item/completed", "params": {"item": item}} + ).messages + assert len(msgs) == 1 + assert msgs[0]["role"] == "assistant" + assert "plan" in msgs[0]["content"].lower() + assert "tool_calls" not in msgs[0] + + +class TestHelpers: + def test_deterministic_call_id_stable(self) -> None: + assert _deterministic_call_id("exec", "abc") == _deterministic_call_id("exec", "abc") + assert _deterministic_call_id("exec", "abc") != _deterministic_call_id("exec", "xyz") + + def test_deterministic_call_id_handles_missing_id(self) -> None: + # Should not raise, should be stable for same item type + a = _deterministic_call_id("exec", "") + b = _deterministic_call_id("exec", "") + assert a == b + assert "exec" in a + + def test_format_tool_args_sorted_keys(self) -> None: + # Sorted keys = deterministic across replays = prefix cache stays valid + a = _format_tool_args({"b": 1, "a": 2}) + b = _format_tool_args({"a": 2, "b": 1}) + assert a == b + + +class TestRoleAlternationInvariant: + """The project must never emit two assistant messages back-to-back from + one item — that breaks Hermes' message alternation invariant.""" + + @pytest.mark.parametrize( + "item", + [ + {"type": "commandExecution", "id": "c1", "command": "x", + "cwd": "/", "status": "completed", "aggregatedOutput": "", + "exitCode": 0, "commandActions": []}, + {"type": "fileChange", "id": "f1", "status": "applied", + "changes": []}, + {"type": 
"mcpToolCall", "id": "m1", "server": "s", "tool": "t", + "status": "completed", "arguments": {}, "result": None, + "error": None}, + {"type": "dynamicToolCall", "id": "d1", "tool": "x", + "arguments": {}, "status": "completed", + "contentItems": [], "success": True}, + ], + ) + def test_tool_items_emit_assistant_then_tool(self, item) -> None: + msgs = CodexEventProjector().project( + {"method": "item/completed", "params": {"item": item}} + ).messages + assert len(msgs) == 2 + assert msgs[0]["role"] == "assistant" + assert msgs[1]["role"] == "tool" + assert msgs[1]["tool_call_id"] == msgs[0]["tool_calls"][0]["id"] diff --git a/tests/agent/transports/test_hermes_tools_mcp_server.py b/tests/agent/transports/test_hermes_tools_mcp_server.py new file mode 100644 index 00000000000..3c11cb3f81d --- /dev/null +++ b/tests/agent/transports/test_hermes_tools_mcp_server.py @@ -0,0 +1,135 @@ +"""Tests for the hermes-tools-as-MCP server module surface. + +We don't run a live MCP session in unit tests — that requires the codex +subprocess + client + an event loop. These tests pin the static +contract: the module imports, the EXPOSED_TOOLS list is sane, and the +build helper assembles a server when the SDK is present. +""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + + +class TestModuleSurface: + def test_module_imports_clean(self): + from agent.transports import hermes_tools_mcp_server as m + assert callable(m.main) + assert callable(m._build_server) + assert isinstance(m.EXPOSED_TOOLS, tuple) + assert len(m.EXPOSED_TOOLS) > 0 + + def test_exposed_tools_are_safe_subset(self): + """We MUST NOT expose tools codex already has, because codex' + own builtins are better-integrated with its sandbox + approvals. 
+ Specifically: no terminal/shell, no read_file/write_file, no + patch — those are codex's built-in tools.""" + from agent.transports.hermes_tools_mcp_server import EXPOSED_TOOLS + forbidden = { + "terminal", "shell", "read_file", "write_file", "patch", + "search_files", "process", + } + leaked = forbidden & set(EXPOSED_TOOLS) + assert not leaked, ( + f"these tools must NOT be exposed via the codex callback " + f"because codex has built-in equivalents: {leaked}" + ) + + def test_expected_hermes_specific_tools_listed(self): + """The Hermes-specific tools should be present so users on the + codex runtime keep access to them.""" + from agent.transports.hermes_tools_mcp_server import EXPOSED_TOOLS + for required in ( + "web_search", + "web_extract", + "browser_navigate", + "vision_analyze", + "image_generate", + "skill_view", + ): + assert required in EXPOSED_TOOLS, f"missing {required!r}" + + def test_agent_loop_tools_not_exposed(self): + """delegate_task / memory / session_search / todo require the + running AIAgent context to dispatch, so a stateless MCP callback + can't drive them. They must NOT be in EXPOSED_TOOLS.""" + from agent.transports.hermes_tools_mcp_server import EXPOSED_TOOLS + for agent_loop_tool in ("delegate_task", "memory", "session_search", "todo"): + assert agent_loop_tool not in EXPOSED_TOOLS, ( + f"{agent_loop_tool!r} requires the agent loop context " + "and can't be reached through a stateless MCP callback" + ) + + def test_kanban_worker_tools_exposed(self): + """Kanban workers run as `hermes chat -q` subprocesses; if they + come up on the codex_app_server runtime, the worker can do the + actual work via codex's shell but needs the kanban tools through + the MCP callback to report back to the kernel. 
Without these + tools available, the worker would hang at completion time.""" + from agent.transports.hermes_tools_mcp_server import EXPOSED_TOOLS + # Worker handoff tools — every dispatched worker uses at least + # one of {complete, block, comment} to close out its task. + for worker_tool in ( + "kanban_complete", + "kanban_block", + "kanban_comment", + "kanban_heartbeat", + ): + assert worker_tool in EXPOSED_TOOLS, ( + f"{worker_tool!r} missing from codex callback — kanban " + "workers on codex_app_server runtime would hang" + ) + + def test_kanban_orchestrator_tools_exposed(self): + """Orchestrator agents need to dispatch new tasks, query the + board, and unblock/link tasks. Exposed so an orchestrator on + codex_app_server can do its job.""" + from agent.transports.hermes_tools_mcp_server import EXPOSED_TOOLS + for orch_tool in ( + "kanban_create", + "kanban_show", + "kanban_list", + "kanban_unblock", + "kanban_link", + ): + assert orch_tool in EXPOSED_TOOLS, ( + f"{orch_tool!r} missing from codex callback" + ) + + +class TestMain: + def test_main_returns_2_when_mcp_unavailable(self, monkeypatch): + """When the mcp package isn't installed, main() should exit + cleanly with code 2 and an install hint, not crash.""" + import agent.transports.hermes_tools_mcp_server as m + + def boom_build(*a, **kw): + raise ImportError("mcp not installed") + + monkeypatch.setattr(m, "_build_server", boom_build) + rc = m.main(["--verbose"]) + assert rc == 2 + + def test_main_handles_keyboard_interrupt(self, monkeypatch): + import agent.transports.hermes_tools_mcp_server as m + + class FakeServer: + def run(self): + raise KeyboardInterrupt() + + monkeypatch.setattr(m, "_build_server", lambda: FakeServer()) + rc = m.main([]) + assert rc == 0 + + def test_main_returns_1_on_runtime_error(self, monkeypatch): + import agent.transports.hermes_tools_mcp_server as m + + class CrashingServer: + def run(self): + raise RuntimeError("boom") + + monkeypatch.setattr(m, "_build_server", lambda: 
CrashingServer()) + rc = m.main([]) + assert rc == 1 diff --git a/tests/hermes_cli/test_codex_runtime_plugin_migration.py b/tests/hermes_cli/test_codex_runtime_plugin_migration.py new file mode 100644 index 00000000000..0274251327c --- /dev/null +++ b/tests/hermes_cli/test_codex_runtime_plugin_migration.py @@ -0,0 +1,589 @@ +"""Tests for the codex MCP plugin migration helper.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from hermes_cli.codex_runtime_plugin_migration import ( + MIGRATION_MARKER, + MigrationReport, + _format_toml_value, + _strip_existing_managed_block, + _translate_one_server, + migrate, + render_codex_toml_section, +) + + +# ---- per-server translation ---- + +class TestTranslateOneServer: + def test_stdio_basic(self): + cfg, skipped = _translate_one_server("filesystem", { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"], + "env": {"FOO": "bar"}, + }) + assert cfg == { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"], + "env": {"FOO": "bar"}, + } + assert skipped == [] + + def test_stdio_with_cwd(self): + cfg, _ = _translate_one_server("custom", { + "command": "/usr/bin/myserver", + "cwd": "/var/lib/mcp", + }) + assert cfg["cwd"] == "/var/lib/mcp" + + def test_http_basic(self): + cfg, skipped = _translate_one_server("api", { + "url": "https://x.example/mcp", + "headers": {"Authorization": "Bearer abc"}, + }) + assert cfg == { + "url": "https://x.example/mcp", + "http_headers": {"Authorization": "Bearer abc"}, + } + assert skipped == [] + + def test_sse_falls_under_streamable_http_with_warning(self): + cfg, skipped = _translate_one_server("sse_server", { + "url": "http://localhost:8000/sse", + "transport": "sse", + }) + assert cfg["url"] == "http://localhost:8000/sse" + assert any("sse" in s.lower() for s in skipped) + + def test_timeouts_translate(self): + cfg, _ = _translate_one_server("x", { + "command": "y", + "timeout": 180, 
+ "connect_timeout": 30, + }) + assert cfg["tool_timeout_sec"] == 180.0 + assert cfg["startup_timeout_sec"] == 30.0 + + def test_non_numeric_timeout_skipped(self): + cfg, skipped = _translate_one_server("x", { + "command": "y", + "timeout": "not-a-number", + }) + assert "tool_timeout_sec" not in cfg + assert any("timeout" in s and "numeric" in s for s in skipped) + + def test_disabled_server_emits_enabled_false(self): + cfg, _ = _translate_one_server("x", { + "command": "y", + "enabled": False, + }) + assert cfg["enabled"] is False + + def test_enabled_true_omitted(self): + cfg, _ = _translate_one_server("x", {"command": "y", "enabled": True}) + assert "enabled" not in cfg # codex defaults to true + + def test_command_and_url_prefers_stdio_warns(self): + cfg, skipped = _translate_one_server("x", { + "command": "y", "url": "http://z", + }) + assert "command" in cfg + assert "url" not in cfg + assert any("url" in s for s in skipped) + + def test_no_transport_returns_none(self): + cfg, skipped = _translate_one_server("broken", {"description": "x"}) + assert cfg is None + assert "no command or url" in skipped[0] + + def test_sampling_dropped_with_warning(self): + cfg, skipped = _translate_one_server("x", { + "command": "y", + "sampling": {"enabled": True, "model": "gemini-3-flash"}, + }) + assert "sampling" not in cfg + assert any("sampling" in s for s in skipped) + + def test_unknown_keys_warned(self): + cfg, skipped = _translate_one_server("x", { + "command": "y", + "totally_made_up_key": "value", + }) + assert "totally_made_up_key" not in cfg + assert any("totally_made_up_key" in s for s in skipped) + + def test_non_dict_input(self): + cfg, skipped = _translate_one_server("x", "notadict") # type: ignore[arg-type] + assert cfg is None + + +# ---- TOML rendering ---- + +class TestTomlValueFormatter: + def test_string_quoted(self): + assert _format_toml_value("hello") == '"hello"' + + def test_string_with_quotes_escaped(self): + assert _format_toml_value('a"b') == 
'"a\\"b"' + + def test_bool(self): + assert _format_toml_value(True) == "true" + assert _format_toml_value(False) == "false" + + def test_int(self): + assert _format_toml_value(42) == "42" + + def test_float(self): + assert _format_toml_value(180.0) == "180.0" + + def test_list_of_strings(self): + assert _format_toml_value(["a", "b"]) == '["a", "b"]' + + def test_inline_table(self): + out = _format_toml_value({"FOO": "bar"}) + assert out == '{ FOO = "bar" }' + + def test_empty_inline_table(self): + assert _format_toml_value({}) == "{}" + + def test_string_with_newline_escaped(self): + """TOML basic strings don't allow literal newlines — a path or + env var containing a newline must use \\n. Otherwise codex would + refuse to load the config.""" + out = _format_toml_value("line one\nline two") + assert "\n" not in out # no raw newline in output + assert "\\n" in out + + def test_string_with_tab_escaped(self): + out = _format_toml_value("col1\tcol2") + assert "\t" not in out + assert "\\t" in out + + def test_string_with_other_controls_escaped(self): + for raw, expected in [ + ("\r", "\\r"), + ("\f", "\\f"), + ("\b", "\\b"), + ]: + out = _format_toml_value(f"x{raw}y") + assert raw not in out, f"{raw!r} should be escaped" + assert expected in out, f"{expected!r} should be in output" + + def test_windows_path_escaped_correctly(self): + out = _format_toml_value(r"C:\Users\Alice\.codex") + # Each backslash should be doubled + assert out == r'"C:\\Users\\Alice\\.codex"' + + def test_atomic_write_no_temp_leak_on_success(self, tmp_path): + """The atomic-write path uses tempfile.mkstemp + rename. 
On + success the temp file should not be left behind.""" + migrate({"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, + discover_plugins=False, + expose_hermes_tools=False, + default_permission_profile=None) + # config.toml should exist + assert (tmp_path / "config.toml").exists() + # And no .config.toml.* temp files left behind + leftover = [p.name for p in tmp_path.iterdir() + if p.name.startswith(".config.toml.")] + assert leftover == [], f"temp file leaked after migration: {leftover}" + + def test_atomic_write_cleanup_on_rename_failure(self, tmp_path, monkeypatch): + """If rename fails partway through (out of disk, permissions, + crash), the temp file must be cleaned up. Otherwise repeated + failed migrations would pile up .config.toml.* files.""" + from pathlib import Path as _Path + original_replace = _Path.replace + + def failing_replace(self, target): + raise OSError("simulated disk full") + + monkeypatch.setattr(_Path, "replace", failing_replace) + report = migrate( + {"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, + discover_plugins=False, + expose_hermes_tools=False, + default_permission_profile=None, + ) + # Error surfaced + assert any("simulated disk full" in e for e in report.errors) + # And no leaked temp file + leftover = [p.name for p in tmp_path.iterdir() + if p.name.startswith(".config.toml.")] + assert leftover == [], f"temp files leaked: {leftover}" + + def test_unsupported_type_raises(self): + with pytest.raises(ValueError): + _format_toml_value(object()) + + +class TestRenderToml: + def test_starts_with_marker(self): + out = render_codex_toml_section({}) + assert out.startswith(MIGRATION_MARKER) + + def test_empty_servers_emits_placeholder(self): + out = render_codex_toml_section({}) + assert "no MCP servers" in out + + def test_servers_sorted_alphabetically(self): + out = render_codex_toml_section({ + "zoo": {"command": "z"}, + "alpha": {"command": "a"}, + "middle": {"command": "m"}, + }) + # Find the section 
header positions and confirm order + a_pos = out.find("[mcp_servers.alpha]") + m_pos = out.find("[mcp_servers.middle]") + z_pos = out.find("[mcp_servers.zoo]") + assert 0 < a_pos < m_pos < z_pos + + def test_server_with_args_and_env(self): + out = render_codex_toml_section({ + "fs": { + "command": "npx", + "args": ["-y", "filesystem"], + "env": {"PATH": "/usr/bin"}, + } + }) + assert "[mcp_servers.fs]" in out + assert 'command = "npx"' in out + assert 'args = ["-y", "filesystem"]' in out + # Env emitted as inline table + assert 'env = { PATH = "/usr/bin" }' in out + + +# ---- existing-block stripping ---- + +class TestStripExistingManagedBlock: + def test_no_managed_block_unchanged(self): + text = "[other]\nfoo = 1\n" + assert _strip_existing_managed_block(text) == text + + def test_strips_managed_block_alone(self): + text = ( + f"{MIGRATION_MARKER}\n" + "\n" + "[mcp_servers.fs]\n" + 'command = "npx"\n' + ) + assert _strip_existing_managed_block(text).strip() == "" + + def test_preserves_user_content_above_managed_block(self): + text = ( + "[model]\n" + 'name = "gpt-5.5"\n' + "\n" + f"{MIGRATION_MARKER}\n" + "[mcp_servers.fs]\n" + 'command = "x"\n' + ) + out = _strip_existing_managed_block(text) + assert "[model]" in out + assert 'name = "gpt-5.5"' in out + assert "mcp_servers.fs" not in out + + def test_preserves_unrelated_section_after_managed_block(self): + text = ( + f"{MIGRATION_MARKER}\n" + "[mcp_servers.fs]\n" + 'command = "x"\n' + "\n" + "[providers]\n" + 'foo = "bar"\n' + ) + out = _strip_existing_managed_block(text) + assert "mcp_servers.fs" not in out + assert "[providers]" in out + assert 'foo = "bar"' in out + + +# ---- end-to-end migrate() ---- + +class TestMigrate: + def test_no_servers_no_plugins_no_perms_writes_placeholder(self, tmp_path): + report = migrate({}, codex_home=tmp_path, + discover_plugins=False, + default_permission_profile=None, expose_hermes_tools=False) + assert report.written + text = (tmp_path / 
"config.toml").read_text() + assert MIGRATION_MARKER in text + assert "no MCP servers" in text or "no MCP servers, plugins, or permissions" in text + + def test_no_servers_still_writes_permissions_default(self, tmp_path): + """Even with zero MCP servers, enabling the runtime should write the + default permissions profile so users don't get prompted on every + write attempt. This is the fix for quirk #2.""" + report = migrate({}, codex_home=tmp_path, discover_plugins=False, expose_hermes_tools=False) + assert report.written + text = (tmp_path / "config.toml").read_text() + # Codex's schema: top-level `default_permissions` keying a built-in + # profile name (prefixed with ":"). NOT a [permissions] section + # (which is for *user-defined* profiles with structured fields). + assert 'default_permissions = ":workspace"' in text + assert report.wrote_permissions_default == ":workspace" + + def test_explicit_none_permissions_skips_block(self, tmp_path): + report = migrate({"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, + discover_plugins=False, + default_permission_profile=None, expose_hermes_tools=False) + text = (tmp_path / "config.toml").read_text() + assert "default_permissions" not in text + assert "[permissions]" not in text + assert report.wrote_permissions_default is None + + def test_plugin_discovery_writes_plugin_blocks(self, tmp_path, monkeypatch): + """Discovered curated plugins land as [plugins."@"] + blocks. 
This is what OpenClaw calls 'migrate native codex plugins.'""" + from hermes_cli import codex_runtime_plugin_migration as crpm + + def fake_query(codex_home=None, timeout=8.0): + return [ + {"name": "google-calendar", "marketplace": "openai-curated", + "enabled": True}, + {"name": "github", "marketplace": "openai-curated", + "enabled": True}, + ], None + monkeypatch.setattr(crpm, "_query_codex_plugins", fake_query) + + report = migrate({}, codex_home=tmp_path, discover_plugins=True, expose_hermes_tools=False) + text = (tmp_path / "config.toml").read_text() + assert '[plugins."github@openai-curated"]' in text + assert '[plugins."google-calendar@openai-curated"]' in text + assert "enabled = true" in text + assert "google-calendar@openai-curated" in report.migrated_plugins + assert "github@openai-curated" in report.migrated_plugins + + def test_plugin_discovery_failure_non_fatal(self, tmp_path, monkeypatch): + """If codex isn't installed or RPC fails, MCP migration still + completes. The error surfaces in the report but doesn't abort.""" + from hermes_cli import codex_runtime_plugin_migration as crpm + + def fake_query_fails(codex_home=None, timeout=8.0): + return [], "codex CLI not available" + monkeypatch.setattr(crpm, "_query_codex_plugins", fake_query_fails) + + report = migrate({"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, discover_plugins=True, expose_hermes_tools=False) + assert report.written + assert report.migrated == ["x"] + assert report.plugin_query_error == "codex CLI not available" + assert report.migrated_plugins == [] + + def test_discover_plugins_false_skips_query(self, tmp_path, monkeypatch): + """Tests and restricted environments can opt out of the subprocess + spawn entirely.""" + from hermes_cli import codex_runtime_plugin_migration as crpm + + called = {"yes": False} + def boom(*a, **kw): + called["yes"] = True + return [], None + monkeypatch.setattr(crpm, "_query_codex_plugins", boom) + + migrate({"mcp_servers": {"x": 
{"command": "y"}}}, + codex_home=tmp_path, discover_plugins=False, expose_hermes_tools=False) + assert called["yes"] is False + + def test_dry_run_skips_plugin_query(self, tmp_path, monkeypatch): + """Dry run should never spawn codex. Even with discover_plugins=True + the query is skipped because dry_run takes precedence.""" + from hermes_cli import codex_runtime_plugin_migration as crpm + + called = {"yes": False} + def boom(*a, **kw): + called["yes"] = True + return [], None + monkeypatch.setattr(crpm, "_query_codex_plugins", boom) + + migrate({"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, dry_run=True, discover_plugins=True, expose_hermes_tools=False) + assert called["yes"] is False + + def test_re_run_replaces_plugin_block(self, tmp_path, monkeypatch): + """Plugin blocks are managed and re-runs should replace them + cleanly — same idempotency contract as MCP servers.""" + from hermes_cli import codex_runtime_plugin_migration as crpm + + # First run: only github + monkeypatch.setattr(crpm, "_query_codex_plugins", + lambda codex_home=None, timeout=8.0: ( + [{"name": "github", "marketplace": "openai-curated", "enabled": True}], + None, + )) + migrate({}, codex_home=tmp_path, discover_plugins=True, + default_permission_profile=None, expose_hermes_tools=False) + first = (tmp_path / "config.toml").read_text() + assert "github@openai-curated" in first + + # Second run: only canva (github went away) + monkeypatch.setattr(crpm, "_query_codex_plugins", + lambda codex_home=None, timeout=8.0: ( + [{"name": "canva", "marketplace": "openai-curated", "enabled": True}], + None, + )) + migrate({}, codex_home=tmp_path, discover_plugins=True, + default_permission_profile=None, expose_hermes_tools=False) + second = (tmp_path / "config.toml").read_text() + assert "github@openai-curated" not in second + assert "canva@openai-curated" in second + + def test_expose_hermes_tools_writes_callback_mcp_entry(self, tmp_path): + """When expose_hermes_tools=True (production 
default), an + [mcp_servers.hermes-tools] entry is written so codex calls back + into Hermes for browser/web/vision/image-generation/skill/kanban tools. + + This is the fix for 'all other tools that codex doesn't provide + should be useable by hermes' — quirk #7.""" + report = migrate({}, codex_home=tmp_path, + discover_plugins=False, + default_permission_profile=None, + expose_hermes_tools=True) + text = (tmp_path / "config.toml").read_text() + assert "[mcp_servers.hermes-tools]" in text + assert "hermes_tools_mcp_server" in text + # Must include startup + tool timeouts so codex doesn't give up + assert "startup_timeout_sec" in text + assert "tool_timeout_sec" in text + # And the entry is reported + assert "hermes-tools" in report.migrated + + def test_expose_hermes_tools_disabled_skips_entry(self, tmp_path): + """expose_hermes_tools=False suppresses the callback registration.""" + migrate({}, codex_home=tmp_path, + discover_plugins=False, + default_permission_profile=None, + expose_hermes_tools=False) + text = (tmp_path / "config.toml").read_text() + assert "[mcp_servers.hermes-tools]" not in text + assert "hermes_tools_mcp_server" not in text + + def test_dry_run_doesnt_write(self, tmp_path): + report = migrate({"mcp_servers": {"x": {"command": "y"}}}, + codex_home=tmp_path, dry_run=True, expose_hermes_tools=False) + assert report.dry_run is True + assert not (tmp_path / "config.toml").exists() + assert "x" in report.migrated + + def test_full_migration_round_trip(self, tmp_path): + hermes_cfg = { + "mcp_servers": { + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem"], + }, + "github": { + "url": "https://api.github.com/mcp", + "headers": {"Authorization": "Bearer x"}, + }, + } + } + report = migrate(hermes_cfg, codex_home=tmp_path, expose_hermes_tools=False) + assert report.written + text = (tmp_path / "config.toml").read_text() + assert "[mcp_servers.filesystem]" in text + assert "[mcp_servers.github]" in text + assert 
'command = "npx"' in text + assert 'url = "https://api.github.com/mcp"' in text + + def test_idempotent_re_run_replaces_managed_block(self, tmp_path): + # First migration + migrate({"mcp_servers": {"a": {"command": "x"}}}, codex_home=tmp_path, expose_hermes_tools=False) + first_text = (tmp_path / "config.toml").read_text() + assert "[mcp_servers.a]" in first_text + # Second migration with different servers + migrate({"mcp_servers": {"b": {"command": "y"}}}, codex_home=tmp_path, expose_hermes_tools=False) + second_text = (tmp_path / "config.toml").read_text() + assert "[mcp_servers.a]" not in second_text + assert "[mcp_servers.b]" in second_text + + def test_preserves_user_codex_config_above_marker(self, tmp_path): + target = tmp_path / "config.toml" + target.write_text( + "[model]\n" + 'profile = "default"\n' + "\n" + "[providers.openai]\n" + 'api_key = "sk-test"\n' + ) + migrate({"mcp_servers": {"a": {"command": "x"}}}, codex_home=tmp_path, expose_hermes_tools=False) + new_text = target.read_text() + # User's codex config preserved + assert "[model]" in new_text + assert 'profile = "default"' in new_text + assert "[providers.openai]" in new_text + # And new MCP block appended + assert "[mcp_servers.a]" in new_text + assert MIGRATION_MARKER in new_text + + def test_preserves_user_mcp_server_outside_managed_block(self, tmp_path): + """Quirk #6: when a user adds their own MCP server entry directly + to ~/.codex/config.toml outside Hermes' managed block, re-running + migration must preserve it. 
Tested both above and below the + managed block.""" + target = tmp_path / "config.toml" + target.write_text( + "[mcp_servers.user-above]\n" + 'command = "/usr/bin/above-server"\n' + 'args = ["--above"]\n' + ) + # First migrate — adds managed block below user content + migrate({"mcp_servers": {"hermes-mcp": {"command": "npx"}}}, + codex_home=tmp_path, discover_plugins=False, + expose_hermes_tools=False) + text = target.read_text() + assert "user-above" in text, "user MCP server above managed block got nuked" + assert 'command = "/usr/bin/above-server"' in text + + # Append another user entry below the managed block + target.write_text( + text + "\n[mcp_servers.user-below]\ncommand = \"below-server\"\n" + ) + # Re-migrate — both should survive + migrate({"mcp_servers": {"hermes-mcp": {"command": "npx"}}}, + codex_home=tmp_path, discover_plugins=False, + expose_hermes_tools=False) + final = target.read_text() + assert "user-above" in final + assert "user-below" in final + # And our managed block is still there with the new content + assert "[mcp_servers.hermes-mcp]" in final + + def test_skipped_keys_reported(self, tmp_path): + report = migrate({ + "mcp_servers": { + "x": { + "command": "y", + "sampling": {"enabled": True}, # codex has no equivalent + } + } + }, codex_home=tmp_path, expose_hermes_tools=False) + assert "x" in report.skipped_keys_per_server + assert any("sampling" in s for s in report.skipped_keys_per_server["x"]) + + def test_invalid_mcp_servers_value(self, tmp_path): + report = migrate({"mcp_servers": "notadict"}, codex_home=tmp_path, expose_hermes_tools=False) + assert any("not a dict" in e for e in report.errors) + + def test_server_without_transport_skipped_with_error(self, tmp_path): + report = migrate({ + "mcp_servers": {"broken": {"description": "no command/url"}} + }, codex_home=tmp_path, expose_hermes_tools=False) + assert "broken" not in report.migrated + assert any("broken" in e for e in report.errors) + + def 
test_summary_reports_migration_count(self, tmp_path): + report = migrate({ + "mcp_servers": {"a": {"command": "x"}, "b": {"command": "y"}} + }, codex_home=tmp_path, expose_hermes_tools=False) + summary = report.summary() + assert "Migrated 2 MCP server(s)" in summary + assert "- a" in summary + assert "- b" in summary diff --git a/tests/hermes_cli/test_codex_runtime_switch.py b/tests/hermes_cli/test_codex_runtime_switch.py new file mode 100644 index 00000000000..9a01543776e --- /dev/null +++ b/tests/hermes_cli/test_codex_runtime_switch.py @@ -0,0 +1,231 @@ +"""Tests for the /codex-runtime slash-command shared logic. + +These cover the pure-Python state machine; CLI and gateway handlers are +tested separately because they involve config persistence and prompt +formatting that's surface-specific.""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from hermes_cli import codex_runtime_switch as crs + + +class TestParseArgs: + @pytest.mark.parametrize("arg,expected", [ + ("", None), + (" ", None), + ("auto", "auto"), + ("codex_app_server", "codex_app_server"), + ("on", "codex_app_server"), + ("off", "auto"), + ("codex", "codex_app_server"), + ("default", "auto"), + ("hermes", "auto"), + ("ENABLE", "codex_app_server"), # case-insensitive + ("DiSaBlE", "auto"), + ]) + def test_valid_args(self, arg, expected): + value, errors = crs.parse_args(arg) + assert errors == [] + assert value == expected + + def test_invalid_arg_returns_error(self): + value, errors = crs.parse_args("turbo") + assert value is None + assert errors and "Unknown runtime" in errors[0] + + +class TestGetCurrentRuntime: + def test_default_when_unset(self): + assert crs.get_current_runtime({}) == "auto" + assert crs.get_current_runtime({"model": {}}) == "auto" + assert crs.get_current_runtime({"model": {"openai_runtime": ""}}) == "auto" + + def test_unrecognized_falls_back_to_auto(self): + assert crs.get_current_runtime( + {"model": {"openai_runtime": 
"garbage"}} + ) == "auto" + + def test_explicit_codex(self): + assert crs.get_current_runtime( + {"model": {"openai_runtime": "codex_app_server"}} + ) == "codex_app_server" + + def test_handles_non_dict_config(self): + assert crs.get_current_runtime(None) == "auto" # type: ignore[arg-type] + assert crs.get_current_runtime("notadict") == "auto" # type: ignore[arg-type] + assert crs.get_current_runtime({"model": "notadict"}) == "auto" + + +class TestSetRuntime: + def test_creates_model_section_if_missing(self): + cfg = {} + old = crs.set_runtime(cfg, "codex_app_server") + assert old == "auto" + assert cfg["model"]["openai_runtime"] == "codex_app_server" + + def test_returns_previous_value(self): + cfg = {"model": {"openai_runtime": "codex_app_server"}} + old = crs.set_runtime(cfg, "auto") + assert old == "codex_app_server" + assert cfg["model"]["openai_runtime"] == "auto" + + def test_invalid_value_raises(self): + with pytest.raises(ValueError): + crs.set_runtime({}, "garbage") + + +class TestApply: + def test_read_only_call_reports_state(self): + cfg = {"model": {"openai_runtime": "codex_app_server"}} + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")): + r = crs.apply(cfg, None) + assert r.success + assert r.new_value == "codex_app_server" + assert r.old_value == "codex_app_server" + assert "codex_app_server" in r.message + assert "0.130.0" in r.message + + def test_no_change_when_already_set(self): + cfg = {"model": {"openai_runtime": "auto"}} + r = crs.apply(cfg, "auto") + assert r.success + assert r.message == "openai_runtime already set to auto" + + def test_enable_blocked_when_codex_missing(self): + cfg = {} + with patch.object(crs, "check_codex_binary_ok", + return_value=(False, "codex not found")): + r = crs.apply(cfg, "codex_app_server") + assert r.success is False + assert "Cannot enable" in r.message + assert "npm i -g @openai/codex" in r.message + # Config NOT mutated on failure + assert cfg.get("model", 
{}).get("openai_runtime") in (None, "") + + def test_enable_succeeds_when_codex_present(self): + cfg = {} + persisted = {} + + def persist(c): + persisted.update(c) + + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")): + r = crs.apply(cfg, "codex_app_server", persist_callback=persist) + assert r.success + assert r.new_value == "codex_app_server" + assert r.old_value == "auto" + assert r.requires_new_session is True + assert "via MCP" in r.message # hermes-tools callback message + assert cfg["model"]["openai_runtime"] == "codex_app_server" + assert persisted["model"]["openai_runtime"] == "codex_app_server" + + def test_disable_does_not_check_binary(self): + cfg = {"model": {"openai_runtime": "codex_app_server"}} + with patch.object(crs, "check_codex_binary_ok") as bin_check: + r = crs.apply(cfg, "auto") + assert r.success + # Binary check is irrelevant when disabling — should not be called + # with the codex_app_server enable-gate signature. + assert r.new_value == "auto" + assert r.old_value == "codex_app_server" + + def test_persist_callback_failure_reported(self): + cfg = {} + + def persist_boom(c): + raise IOError("disk full") + + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")): + r = crs.apply(cfg, "codex_app_server", persist_callback=persist_boom) + assert r.success is False + assert "persist failed" in r.message + assert "disk full" in r.message + + def test_enable_triggers_mcp_migration(self): + """Enabling codex_app_server should auto-migrate Hermes mcp_servers + to ~/.codex/config.toml so the spawned subprocess sees them.""" + cfg = { + "mcp_servers": { + "filesystem": {"command": "npx", "args": ["-y", "fs-server"]}, + } + } + + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")), \ + patch("hermes_cli.codex_runtime_plugin_migration.migrate") as mig: + mig.return_value.migrated = ["filesystem", "hermes-tools"] + mig.return_value.migrated_plugins = [] + 
mig.return_value.plugin_query_error = None + mig.return_value.wrote_permissions_default = ":workspace" + mig.return_value.errors = [] + mig.return_value.target_path = "/fake/.codex/config.toml" + r = crs.apply(cfg, "codex_app_server") + assert r.success + assert mig.called # migration was triggered + # User MCP servers are reported (excluding internal hermes-tools) + assert "Migrated 1 MCP server" in r.message + assert "filesystem" in r.message + # Permissions default surfaces + assert "Default sandbox: :workspace" in r.message + # Hermes tool callback announcement + assert "via MCP" in r.message + + def test_disable_does_not_trigger_migration(self): + """Switching back to auto must not write to ~/.codex/.""" + cfg = { + "model": {"openai_runtime": "codex_app_server"}, + "mcp_servers": {"x": {"command": "y"}}, + } + with patch("hermes_cli.codex_runtime_plugin_migration.migrate") as mig: + r = crs.apply(cfg, "auto") + assert r.success + assert not mig.called # disabling does not migrate + + def test_migration_failure_does_not_block_enable(self): + """If MCP migration raises, the runtime change still proceeds — + users can manually re-run migration later.""" + cfg = {"mcp_servers": {"x": {"command": "y"}}} + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")), \ + patch("hermes_cli.codex_runtime_plugin_migration.migrate", + side_effect=RuntimeError("disk full")): + r = crs.apply(cfg, "codex_app_server") + assert r.success # change still applied + assert r.new_value == "codex_app_server" + assert "MCP migration skipped" in r.message + assert "disk full" in r.message + + def test_binary_check_cached_within_apply(self): + """check_codex_binary_ok is invoked at most once per apply() call. + + The enable path has three sites that need the version (state report, + enable gate, success message). Without caching, a single + /codex-runtime invocation spawns `codex --version` three times. + Regression guard against a refactor that drops the cache. 
+ """ + cfg = {} + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")) as bin_check, \ + patch("hermes_cli.codex_runtime_plugin_migration.migrate"): + r = crs.apply(cfg, "codex_app_server") + assert r.success + assert bin_check.call_count == 1, ( + f"check_codex_binary_ok was called {bin_check.call_count} time(s); " + "should be cached and called exactly once per apply()" + ) + + def test_binary_check_cached_on_read_only_call(self): + """Read-only call (new_value=None) calls the binary check exactly + once and reuses the result for the message.""" + cfg = {"model": {"openai_runtime": "codex_app_server"}} + with patch.object(crs, "check_codex_binary_ok", + return_value=(True, "0.130.0")) as bin_check: + crs.apply(cfg, None) + assert bin_check.call_count == 1 diff --git a/tests/run_agent/test_codex_app_server_integration.py b/tests/run_agent/test_codex_app_server_integration.py new file mode 100644 index 00000000000..6fc60695d2a --- /dev/null +++ b/tests/run_agent/test_codex_app_server_integration.py @@ -0,0 +1,344 @@ +"""Integration test for the codex_app_server runtime path through AIAgent. 
+ +Verifies that: + - api_mode='codex_app_server' is accepted on AIAgent construction + - run_conversation() takes the early-return path and never enters the + chat completions loop + - Projected messages from a fake Codex session land in the messages list + - tool_iterations from the codex session tick the skill nudge counter + - Memory nudge counter ticks once per turn + - The returned dict has the same shape as the chat_completions path +""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +import run_agent +from agent.transports.codex_app_server_session import CodexAppServerSession, TurnResult + + +@pytest.fixture +def fake_session(monkeypatch): + """Replace CodexAppServerSession with a stub that returns a fixed + TurnResult, so we can drive AIAgent without spawning real codex.""" + + def fake_run_turn(self, user_input: str, **kwargs): + return TurnResult( + final_text=f"echo: {user_input}", + projected_messages=[ + {"role": "assistant", "content": None, + "tool_calls": [{"id": "exec_1", "type": "function", + "function": {"name": "exec_command", + "arguments": "{}"}}]}, + {"role": "tool", "tool_call_id": "exec_1", "content": "ok"}, + {"role": "assistant", "content": f"echo: {user_input}"}, + ], + tool_iterations=1, + interrupted=False, + error=None, + turn_id="turn-stub-1", + thread_id="thread-stub-1", + ) + + monkeypatch.setattr(CodexAppServerSession, "run_turn", fake_run_turn) + monkeypatch.setattr( + CodexAppServerSession, "ensure_started", lambda self: "thread-stub-1" + ) + + +def _make_codex_agent(): + """Construct an AIAgent in codex_app_server mode without contacting any + real provider. 
We pass api_mode explicitly so the constructor takes the + fast path for direct credentials.""" + return run_agent.AIAgent( + api_key="stub", + base_url="https://stub.invalid", + provider="openai", + api_mode="codex_app_server", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + + +class TestApiModeAccepted: + def test_api_mode_is_codex_app_server(self): + agent = _make_codex_agent() + assert agent.api_mode == "codex_app_server" + + +class TestRunConversationCodexPath: + def test_run_conversation_returns_codex_shape(self, fake_session): + agent = _make_codex_agent() + # No background review fork during tests + with patch.object(agent, "_spawn_background_review", return_value=None): + result = agent.run_conversation("hello there") + assert result["final_response"] == "echo: hello there" + assert result["completed"] is True + assert result["partial"] is False + assert result["error"] is None + assert result["api_calls"] == 1 + assert result["codex_thread_id"] == "thread-stub-1" + assert result["codex_turn_id"] == "turn-stub-1" + + def test_projected_messages_are_spliced(self, fake_session): + agent = _make_codex_agent() + with patch.object(agent, "_spawn_background_review", return_value=None): + result = agent.run_conversation("hello") + msgs = result["messages"] + # User message + 3 projected (assistant tool_call + tool + assistant text) + assert len(msgs) >= 4 + assert msgs[0]["role"] == "user" + assert msgs[0]["content"] == "hello" + # Last assistant message has the final text + final = [m for m in msgs if m.get("role") == "assistant" + and m.get("content") == "echo: hello"] + assert final, f"expected final assistant message in {msgs}" + + def test_nudge_counters_tick(self, fake_session): + """The skill nudge counter must accumulate tool_iterations across + turns. 
The memory nudge counter is gated on memory being configured + (which we skip via skip_memory=True), so we don't assert on it here — + a separate test below covers that path explicitly.""" + agent = _make_codex_agent() + agent._iters_since_skill = 0 + agent._user_turn_count = 0 + with patch.object(agent, "_spawn_background_review", return_value=None): + agent.run_conversation("first") + assert agent._iters_since_skill == 1 # one tool_iteration in fake turn + # _user_turn_count is incremented by run_conversation pre-loop, not + # by the codex helper — confirms we delegate that to the standard flow. + assert agent._user_turn_count == 1 + with patch.object(agent, "_spawn_background_review", return_value=None): + agent.run_conversation("second") + assert agent._iters_since_skill == 2 + assert agent._user_turn_count == 2 + + def test_user_message_not_duplicated(self, fake_session): + """Regression guard: the user message must appear exactly once in + the messages list. The standard run_conversation pre-loop appends + it, and the codex helper must NOT append again.""" + agent = _make_codex_agent() + with patch.object(agent, "_spawn_background_review", return_value=None): + result = agent.run_conversation("ping unique 12345") + user_count = sum( + 1 for m in result["messages"] + if m.get("role") == "user" and m.get("content") == "ping unique 12345" + ) + assert user_count == 1, f"user message appeared {user_count}× in {result['messages']}" + + def test_background_review_NOT_invoked_below_threshold(self, fake_session): + """A single turn shouldn't trigger background review — counters + haven't reached the nudge interval (default 10).""" + agent = _make_codex_agent() + agent._memory_nudge_interval = 10 + agent._skill_nudge_interval = 10 + agent._iters_since_skill = 0 + with patch.object(agent, "_spawn_background_review", + return_value=None) as spawn: + agent.run_conversation("ping") + # Below threshold → review should NOT fire (was a real bug: + # the helper was calling 
_spawn_background_review() with no + # args after every turn, which would crash with TypeError). + assert not spawn.called + + def test_background_review_skill_trigger_fires_above_threshold( + self, monkeypatch + ): + """When tool iterations cross the skill nudge interval, the + background review fires with review_skills=True and the right + messages_snapshot signature.""" + from agent.transports.codex_app_server_session import ( + CodexAppServerSession, TurnResult, + ) + # Make the fake session report 10 tool iterations in one turn + # (matching the default skill threshold). + def fake_run_turn(self, user_input: str, **kwargs): + return TurnResult( + final_text=f"echo: {user_input}", + projected_messages=[ + {"role": "assistant", "content": f"echo: {user_input}"}, + ], + tool_iterations=10, + turn_id="t1", thread_id="th1", + ) + monkeypatch.setattr(CodexAppServerSession, "run_turn", fake_run_turn) + monkeypatch.setattr( + CodexAppServerSession, "ensure_started", lambda self: "th1" + ) + + agent = _make_codex_agent() + agent._skill_nudge_interval = 10 + agent._iters_since_skill = 0 + # Make valid_tool_names include 'skill_manage' so the gate passes + agent.valid_tool_names = set(getattr(agent, "valid_tool_names", set())) + agent.valid_tool_names.add("skill_manage") + + with patch.object(agent, "_spawn_background_review", + return_value=None) as spawn: + agent.run_conversation("do tool work") + + assert spawn.called, "skill threshold tripped but review didn't fire" + # Verify the call signature matches what _spawn_background_review + # actually expects — this is the regression guard for the original + # bug where the codex path called it with no args at all. 
+ call = spawn.call_args + assert "messages_snapshot" in call.kwargs + assert isinstance(call.kwargs["messages_snapshot"], list) + assert call.kwargs["review_skills"] is True + # Counter should be reset after the review fires + assert agent._iters_since_skill == 0 + + def test_background_review_signature_never_breaks(self, fake_session): + """Even when no trigger fires, the helper must never call + _spawn_background_review with the wrong signature. Run a turn, + then run another turn after manually tripping the skill counter + and confirm the call shape is the kwargs-only form the function + actually accepts.""" + agent = _make_codex_agent() + agent._skill_nudge_interval = 1 # very low so any iter trips it + agent._iters_since_skill = 0 + agent.valid_tool_names = set(getattr(agent, "valid_tool_names", set())) + agent.valid_tool_names.add("skill_manage") + + with patch.object(agent, "_spawn_background_review", + return_value=None) as spawn: + agent.run_conversation("first") + # The fake session reports tool_iterations=1, which trips + # _skill_nudge_interval=1. So review should fire. + assert spawn.called + # Critical invariant: positional args must be empty, all real + # args must be kwargs (matching _spawn_background_review's + # actual signature). + call = spawn.call_args + assert call.args == (), ( + f"expected no positional args, got {call.args!r} — " + "would crash _spawn_background_review at runtime" + ) + assert "messages_snapshot" in call.kwargs + + def test_chat_completions_loop_is_not_entered(self, fake_session): + """The early-return must bypass the regular API call loop entirely. + We confirm by patching the SDK call and asserting it's never invoked.""" + agent = _make_codex_agent() + # The chat_completions loop calls self.client.chat.completions.create(...) + # If our early-return works, that path is dead. 
+ with patch.object(agent, "client") as client_mock, patch.object( + agent, "_spawn_background_review", return_value=None + ): + agent.run_conversation("hi") + assert not client_mock.chat.completions.create.called + + +class TestReviewForkApiModeDowngrade: + """When the parent agent runs on codex_app_server, the background + review fork must downgrade to codex_responses — otherwise the fork + can't dispatch agent-loop tools (memory, skill_manage) which is the + whole point of the review.""" + + def test_codex_app_server_parent_downgrades_review_fork(self): + """Live test against the real _spawn_background_review code path: + verify the review_agent gets api_mode=codex_responses when the + parent is codex_app_server.""" + from unittest.mock import MagicMock, patch as _patch + agent = _make_codex_agent() + # Pretend memory + skills are configured so the review fork + # reaches the AIAgent constructor. + agent._memory_store = MagicMock() + agent._memory_enabled = True + agent._user_profile_enabled = True + # Mock _current_main_runtime to return the parent's codex_app_server + # state so we can confirm the helper detects + downgrades it. + agent._current_main_runtime = lambda: { + "api_mode": "codex_app_server", + "base_url": "https://chatgpt.com/backend-api/codex", + "api_key": "stub-token", + } + # Capture what AIAgent gets constructed with inside the helper. + captured = {} + + def _capture_init(self, **kwargs): + captured.update(kwargs) + # Set bare attributes the rest of the spawn function reads + # so it can finish without exploding. 
+ self.api_mode = kwargs.get("api_mode") + self.provider = kwargs.get("provider") + self.model = kwargs.get("model") + self._memory_write_origin = None + self._memory_write_context = None + self._memory_store = None + self._memory_enabled = False + self._user_profile_enabled = False + self._memory_nudge_interval = 0 + self._skill_nudge_interval = 0 + self.suppress_status_output = False + self._session_messages = [] + + def _no_op_run_conv(*a, **kw): + return {"final_response": "", "messages": []} + self.run_conversation = _no_op_run_conv + + def _no_op_close(*a, **kw): + return None + self.close = _no_op_close + + with _patch("run_agent.AIAgent.__init__", _capture_init): + agent._spawn_background_review( + messages_snapshot=[{"role": "user", "content": "x"}], + review_memory=True, + review_skills=False, + ) + # Wait for the spawned thread to actually execute + import time + for _ in range(30): + if "api_mode" in captured: + break + time.sleep(0.1) + + assert captured.get("api_mode") == "codex_responses", ( + f"review fork should be downgraded to codex_responses when " + f"parent is codex_app_server; got {captured.get('api_mode')!r}" + ) + + +class TestErrorHandling: + def test_session_exception_returns_partial_with_error(self, monkeypatch): + def boom_run_turn(self, user_input, **kwargs): + raise RuntimeError("subprocess died") + + monkeypatch.setattr(CodexAppServerSession, "ensure_started", + lambda self: "t1") + monkeypatch.setattr(CodexAppServerSession, "run_turn", boom_run_turn) + + agent = _make_codex_agent() + with patch.object(agent, "_spawn_background_review", return_value=None): + result = agent.run_conversation("hi") + assert result["completed"] is False + assert result["partial"] is True + assert "subprocess died" in result["error"] + assert "codex-runtime auto" in result["final_response"] + + def test_interrupted_turn_marked_partial(self, monkeypatch): + def interrupted_turn(self, user_input, **kwargs): + return TurnResult( + final_text="", + 
+ projected_messages=[], + tool_iterations=0, + interrupted=True, + error="user interrupted", + turn_id="t", + thread_id="th", + ) + monkeypatch.setattr(CodexAppServerSession, "ensure_started", + lambda self: "th") + monkeypatch.setattr(CodexAppServerSession, "run_turn", interrupted_turn) + + agent = _make_codex_agent() + with patch.object(agent, "_spawn_background_review", return_value=None): + result = agent.run_conversation("hi") + assert result["completed"] is False + assert result["partial"] is True + assert result["error"] == "user interrupted" diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index b17036ade44..409ddf8fe35 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -515,6 +515,8 @@ Advanced per-platform knobs for throttling the outbound message batcher. Most us | `HERMES_HUMAN_DELAY_MIN_MS` | Custom delay range minimum (ms) | | `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) | | `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) | +| `CODEX_HOME` | When [Codex app-server runtime](../user-guide/features/codex-app-server-runtime) is enabled, override the directory Codex CLI reads its config + auth from (default: `~/.codex`). Hermes' migration writes the managed block to `<CODEX_HOME>/config.toml`. | +| `HERMES_KANBAN_TASK` | Set by the kanban dispatcher when spawning a worker (task UUID). Workers and the spawned `hermes-tools` MCP subprocess inherit it so kanban tools gate correctly. Don't set manually. | | `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) | | `HERMES_API_CALL_STALE_TIMEOUT` | Non-streaming stale-call timeout in seconds (default: `300`). Auto-disabled for local providers when left unset. Also configurable via `providers..stale_timeout_seconds` or `providers..models..stale_timeout_seconds` in `config.yaml`. 
| | `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. | diff --git a/website/docs/reference/slash-commands.md b/website/docs/reference/slash-commands.md index 718da1350aa..377c31c4477 100644 --- a/website/docs/reference/slash-commands.md +++ b/website/docs/reference/slash-commands.md @@ -50,6 +50,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in |---------|-------------| | `/config` | Show current configuration | | `/model [model-name]` | Show or change the current model. Supports: `/model claude-sonnet-4`, `/model provider:model` (switch providers), `/model custom:model` (custom endpoint), `/model custom:name:model` (named custom provider), `/model custom` (auto-detect from endpoint), and user-defined aliases (`/model fav`, `/model grok` — see [Custom model aliases](#custom-model-aliases)). Use `--global` to persist the change to config.yaml. **Note:** `/model` can only switch between already-configured providers. To add a new provider, exit the session and run `hermes model` from your terminal. | +| `/codex-runtime [auto\|codex_app_server\|on\|off]` | Toggle the optional [Codex app-server runtime](../user-guide/features/codex-app-server-runtime) for OpenAI/Codex models. `auto` (default) uses Hermes' standard chat completions; `codex_app_server` hands turns to a `codex app-server` subprocess for native shell, apply_patch, ChatGPT subscription auth, and migrated Codex plugins. Effective on next session. | | `/personality` | Set a predefined personality | | `/verbose` | Cycle tool progress display: off → new → all → verbose. Can be [enabled for messaging](#notes) via config. | | `/fast [normal\|fast\|status]` | Toggle fast mode — OpenAI Priority Processing / Anthropic Fast Mode. Options: `normal`, `fast`, `status`. 
| @@ -180,6 +181,7 @@ The messaging gateway supports the following built-in commands inside Telegram, | `/status` | Show session info. | | `/stop` | Kill all running background processes and interrupt the running agent. | | `/model [provider:model]` | Show or change the model. Supports provider switches (`/model zai:glm-5`), custom endpoints (`/model custom:model`), named custom providers (`/model custom:local:qwen`), auto-detect (`/model custom`), and user-defined aliases (`/model fav`, `/model grok` — see [Custom model aliases](#custom-model-aliases)). Use `--global` to persist the change to config.yaml. **Note:** `/model` can only switch between already-configured providers. To add a new provider or set up API keys, use `hermes model` from your terminal (outside the chat session). | +| `/codex-runtime [auto\|codex_app_server\|on\|off]` | Toggle the optional [Codex app-server runtime](../user-guide/features/codex-app-server-runtime). Persists to `model.openai_runtime` in config.yaml and evicts the cached agent so the next message picks up the new runtime. Effective on next session. | | `/personality [name]` | Set a personality overlay for the session. | | `/fast [normal\|fast\|status]` | Toggle fast mode — OpenAI Priority Processing / Anthropic Fast Mode. | | `/retry` | Retry the last message. | diff --git a/website/docs/user-guide/features/codex-app-server-runtime.md b/website/docs/user-guide/features/codex-app-server-runtime.md new file mode 100644 index 00000000000..5d4b068088b --- /dev/null +++ b/website/docs/user-guide/features/codex-app-server-runtime.md @@ -0,0 +1,443 @@ +--- +title: Codex App-Server Runtime (optional) +sidebar_label: Codex App-Server Runtime +--- + +# Codex App-Server Runtime + +Hermes can optionally hand `openai/*` and `openai-codex/*` turns to the [Codex CLI app-server](https://github.com/openai/codex) instead of running its own tool loop. 
When enabled, terminal commands, file edits, sandboxing, and MCP tool calls all execute inside Codex's runtime — Hermes becomes the shell around it (sessions DB, slash commands, gateway, memory and skill review). + +This is **opt-in only**. Default Hermes behavior is unchanged unless you flip the flag. Hermes never auto-routes you onto this runtime. + +## Why + +- Run OpenAI agent turns against your **ChatGPT subscription** (no API key required) using the same auth flow Codex CLI uses. +- Use **Codex's own toolset and sandbox** — `shell` for terminal/read/write/search, `apply_patch` for structured edits, `update_plan` for planning, all running inside seatbelt/landlock sandboxing. +- **Native Codex plugins** — Linear, GitHub, Gmail, Calendar, Canva, etc. — installed via `codex plugin` are auto-migrated and active in your Hermes session. +- **Hermes' richer tools come along** — web_search, web_extract, browser automation, vision, image generation, skills, and TTS work via an MCP callback. Codex calls back into Hermes for tools it doesn't have built in. +- **Memory and skill nudges keep working** — Codex's events are projected into Hermes' message shape so the self-improvement loop sees a normal-looking transcript. + +## What tools the model actually has + +This is the part most users want to know up front. When this runtime is on, the model running your turn has three independent sources of tools: + +### 1. Codex's built-in toolset (always on) + +These ship with `codex app-server` itself — no Hermes involvement, no MCP, no plugins. All five are available the moment the runtime starts: + +- **`shell`** — runs arbitrary shell commands inside the sandbox. This is how the model reads files (`cat`, `head`, `tail`), writes them (`echo > foo`, heredocs), searches them (`find`, `rg`, `grep`), navigates directories (`ls`, `cd`), runs builds, manages processes, and anything else you'd do in bash. 
+- **`apply_patch`** — applies a structured multi-file diff in Codex's patch format. The model uses this for non-trivial code edits (adding a function, refactoring across files); shell heredocs are still available for one-off writes. +- **`update_plan`** — codex's internal todo / plan tracker. Equivalent of Hermes' `todo` tool, but managed entirely inside codex's runtime. +- **`view_image`** — load a local image file into the conversation so the model can see it. +- **`web_search`** — codex has its own built-in web search when configured. Hermes also exposes `web_search` (Firecrawl-backed) via the callback below; the model picks whichever it prefers. + +So **anything you'd do via terminal — read/write/search/find/run — codex does natively**. The sandbox profile (`:workspace` by default when you enable the runtime) controls what's writable. + +### 2. Native Codex plugins (auto-migrated from your `codex plugin` install) + +When you enable the runtime, Hermes queries codex's `plugin/list` RPC and writes a `[plugins."<name>@openai-curated"]` entry for every plugin you have installed. The plugins themselves are managed by codex and authorized once via codex's own UI. + +Examples (the ones the OpenClaw thread highlighted as "YouTube-video-worthy"): + +- **Linear** — find/update issues +- **GitHub** — search code, view PRs, comment +- **Gmail** — read/send mail +- **Google Calendar** — create/find events +- **Outlook calendar/email** — same shape via the Microsoft connector +- **Canva** — design generation +- ...whatever else you've installed via `codex plugin marketplace add openai-curated` + `codex plugin install ...` + +What's NOT migrated: +- Plugins you haven't installed yet — install them in Codex first. +- ChatGPT app marketplace entries (`app/list`) — these are already enabled inside codex by virtue of your account auth. + +### 3. 
Hermes tool callback (MCP server, registered in `~/.codex/config.toml`) + +Hermes registers itself as an MCP server so codex can call back for tools codex doesn't ship with. Available via the callback: + +- **`web_search`** / **`web_extract`** — Firecrawl-backed; tends to be cleaner than scraping for structured content. +- **`browser_navigate` / `browser_click` / `browser_type` / `browser_press` / `browser_snapshot` / `browser_scroll` / `browser_back` / `browser_get_images` / `browser_console` / `browser_vision`** — full browser automation via Camofox or Browserbase. +- **`vision_analyze`** — call a separate vision model to inspect an image (different from codex's `view_image` which loads it into the conversation). +- **`image_generate`** — image generation through Hermes' image_gen plugin chain. +- **`skill_view` / `skills_list`** — read from Hermes' skill library. +- **`text_to_speech`** — TTS through Hermes' configured provider. + +When the model wants one of these, codex spawns the `hermes_tools_mcp_server` subprocess via stdio MCP, the call is dispatched through `model_tools.handle_function_call()` (same code path as Hermes' default runtime), and the result is returned to codex like any other MCP response. + +### What's NOT available on this runtime + +These four Hermes tools require the running AIAgent context (mid-loop state) to dispatch, and a stateless MCP callback can't drive them. 
Switch back to the default runtime (`/codex-runtime auto`) when you need any of them: + +- **`delegate_task`** — spawn subagents +- **`memory`** — Hermes' persistent memory store +- **`session_search`** — cross-session search +- **`todo`** — Hermes' todo store (codex's `update_plan` is the in-runtime equivalent) + +## Workflow features (`/goal`, kanban, cron) + +### `/goal` (the Ralph loop) + +**Works on this runtime.** Goals persist in `state_meta` keyed by session id, the continuation prompt feeds back as a normal user message through `run_conversation()`, and codex executes the next turn natively. The goal judge runs via the auxiliary client (configured via `auxiliary.goal_judge` in config.yaml), independent of which runtime is active. The judge's "blocked, needs user input" verdict is a clean escape if codex stalls on approvals. + +**One thing to be aware of:** each continuation prompt is a fresh codex turn, which means codex re-evaluates command approval policy from scratch. If you're doing a long-running goal with lots of writes, expect more approval prompts than you'd see on a single in-session task. Set `default_permissions = ":workspace"` (which Hermes does automatically when you enable the runtime) so simple workspace writes don't require prompting. + +### Kanban (multi-agent worktree dispatch) + +**Works on this runtime, with one subtle dependency.** The kanban dispatcher spawns each worker as a separate `hermes chat -q` subprocess that reads the user's config — which means if `model.openai_runtime: codex_app_server` is set globally, workers also come up on the codex runtime. + +What works inside a codex-runtime worker: +- Codex's full toolset (shell, apply_patch, update_plan, view_image, web_search) — the worker does its actual task work natively +- The migrated codex plugins — Linear, GitHub, etc. 
+- The Hermes tool callback for browser_*, vision, image_gen, skills, TTS + +What also works because the MCP callback exposes them: +- **`kanban_complete` / `kanban_block` / `kanban_comment` / `kanban_heartbeat`** — the worker handoff tools. These read `HERMES_KANBAN_TASK` from env (set by the dispatcher), gate access correctly, and write to `~/.hermes/kanban.db`. Without these in the callback, a worker on this runtime could do its task but couldn't report back, hanging until the dispatcher's timeout. +- **`kanban_show` / `kanban_list`** — read-only board queries for the worker to check its own context. +- **`kanban_create` / `kanban_unblock` / `kanban_link`** — orchestrator-only operations. Available for orchestrator agents running on the codex runtime that need to dispatch new tasks. + +The kanban tools are gated by `HERMES_KANBAN_TASK` env var the dispatcher sets — that var is propagated to the codex subprocess (codex inherits env) and from there to the spawned `hermes-tools` MCP server subprocess. So the tools see the right task id and gate correctly. + +### Cron jobs + +**Not specifically tested.** Cron jobs run via `cronjob` → `AIAgent.run_conversation`, the same code path as the CLI. If the cron job's config has `openai_runtime: codex_app_server` it'll run on codex. The same tool-availability rules apply — codex built-ins + plugins + MCP callback work, agent-loop tools (delegate_task, memory, session_search, todo) don't. If your cron job relies on those, scope the cron to a profile that uses the default runtime. 
+ +## Trade-offs + +| | Hermes default runtime | Codex app-server (opt-in) | +|---|---|---| +| `delegate_task` subagents | yes | not available — needs agent loop context | +| `memory`, `session_search`, `todo` | yes | not available — needs agent loop context | +| `web_search`, `web_extract` | yes | yes (via MCP callback) | +| Browser automation (Camofox/Browserbase) | yes | yes (via MCP callback) | +| `vision_analyze`, `image_generate` | yes | yes (via MCP callback) | +| `skill_view`, `skills_list` | yes | yes (via MCP callback) | +| `text_to_speech` | yes | yes (via MCP callback) | +| Codex `shell` (terminal/read/write/search/find/run) | — | yes (Codex built-in) | +| Codex `apply_patch` (structured multi-file edits) | — | yes (Codex built-in) | +| Codex `update_plan` (in-runtime todo) | — | yes (Codex built-in) | +| Codex `view_image` (load image into conversation) | — | yes (Codex built-in) | +| Codex sandbox (seatbelt/landlock, profiles) | — | yes (Codex built-in) | +| ChatGPT subscription auth | — | yes (via `openai-codex` provider) | +| Native Codex plugins (Linear, GitHub, etc.) | — | yes (auto-migrated) | +| User MCP servers | yes | yes (auto-migrated to codex) | +| Memory + skill review (background) | yes | yes (via item projection) | +| Multi-turn conversations | yes | yes | +| `/goal` (Ralph loop) | yes | yes | +| Kanban worker dispatch | yes | yes (via callback) | +| Kanban orchestrator tools | yes | yes (via callback) | +| All gateway platforms | yes | yes | +| Non-OpenAI providers | yes | n/a — OpenAI/Codex-scoped | + +## Prerequisites + +1. **Codex CLI installed:** + ```bash + npm i -g @openai/codex + codex --version # 0.130.0 or newer + ``` +2. **Codex OAuth login.** The codex subprocess reads `~/.codex/auth.json`. Populate it with Codex's own login command: + ```bash + codex login # writes tokens to ~/.codex/auth.json + ``` + Hermes' own `hermes auth login codex` writes to `~/.hermes/auth.json` — that's a separate session. 
**Run `codex login` separately** if you haven't. + +3. **(Optional) Install the Codex plugins you want.** When you enable the runtime, Hermes auto-migrates whichever curated plugins you've already installed via Codex CLI: + ```bash + codex plugin marketplace add openai-curated + # then via codex's TUI, install Linear / GitHub / Gmail / etc. + ``` + Hermes will discover them and write `[plugins."<name>@openai-curated"]` entries to `~/.codex/config.toml` automatically. + +## Enabling + +In a Hermes session: + +``` +/codex-runtime codex_app_server +``` + +That command: +- Verifies the `codex` CLI is installed (blocks with an install hint if not). +- Persists `model.openai_runtime: codex_app_server` to your config.yaml. +- Migrates user MCP servers from `~/.hermes/config.yaml` to `~/.codex/config.toml`. +- **Discovers and migrates installed native Codex plugins** (Linear, GitHub, Gmail, Calendar, Canva, etc.) by querying Codex's `plugin/list` RPC. +- **Registers Hermes' own tools as an MCP server** so the codex subprocess can call back for tools codex doesn't ship with. +- **Writes `default_permissions = ":workspace"`** so the sandbox allows writes within the workspace without prompting for every operation. +- Tells you what was migrated. Takes effect on the **next** session — the current cached agent keeps the prior runtime so prompt caches stay valid. + +Synonyms: `/codex-runtime on`, `/codex-runtime off`, `/codex-runtime auto`. + +To check current state without changing anything: +``` +/codex-runtime +``` + +You can also set it manually in `~/.hermes/config.yaml`: +```yaml +model: + openai_runtime: codex_app_server # default is "auto" (= Hermes runtime) +``` + +## Self-improvement loop (memory + skill nudges) + +Hermes' background self-improvement fires on counter thresholds: + +- Every 10 user prompts → a forked review agent looks at the conversation and decides whether anything should be saved to memory. 
+- Every 10 tool iterations within a single turn → same idea but for skills (`skill_manage` writes). + +**Both keep working on the codex runtime.** The codex path projects each completed `commandExecution` / `fileChange` / `mcpToolCall` / `dynamicToolCall` item into a synthetic `assistant tool_call` + `tool` result message, so by the time the review runs it sees the same shape it sees on the default Hermes runtime. + +How the wiring stays equivalent: + +| | Default runtime | Codex runtime | +|---|---|---| +| `_turns_since_memory` increments | per user prompt, in run_conversation pre-loop | same code path, before the early-return | +| `_iters_since_skill` increments | per tool iteration in the chat-completions loop | by `turn.tool_iterations` after the codex turn returns | +| Memory trigger (`_turns_since_memory >= _memory_nudge_interval`) | computed in pre-loop, fires after response | computed in pre-loop, passed through to codex helper | +| Skill trigger (`_iters_since_skill >= _skill_nudge_interval`) | computed after the loop | computed after the codex turn | +| `_spawn_background_review(messages_snapshot=..., review_memory=..., review_skills=...)` | called when either trigger fires | called identically when either trigger fires | + +One detail: the review fork itself needs to call Hermes' agent-loop tools (`memory`, `skill_manage`), which require Hermes' own dispatch. So when the parent agent is on `codex_app_server`, the review fork is **downgraded to `codex_responses`** — same OAuth credentials, same `openai-codex` provider, but talks to OpenAI's Responses API directly so Hermes owns the loop and the agent-loop tools work. This is invisible to the user. + +Net effect: enable the codex runtime and your memory + skill nudges keep firing exactly as they would otherwise. + +## How approvals work + +Codex requests approval before executing commands or applying patches. 
These get translated into Hermes' standard "Dangerous Command" prompt: + +``` +╭───────────────────────────────────────╮ +│ Dangerous Command │ +│ │ +│ /bin/bash -lc 'echo hello > foo.txt' │ +│ │ +│ ❯ 1. Allow once │ +│ 2. Allow for this session │ +│ 3. Deny │ +│ │ +│ Codex requests exec in /your/cwd │ +╰───────────────────────────────────────╯ +``` + +- **Allow once** → approve this single command. +- **Allow for this session** → Codex won't re-prompt for similar commands. +- **Deny** → command is rejected; Codex continues in read-only mode. + +For `apply_patch` (file edit) approvals, Hermes shows a summary of what changed (`1 add, 1 update: /tmp/new.py, /tmp/old.py`) when codex provides the data via the corresponding `fileChange` item. + +## Permission profiles + +Codex has three built-in permission profiles: +- `:read-only` — no writes; every shell command requires approval +- `:workspace` — writes within the current workspace allowed without prompts (Hermes' default when you enable the runtime) +- `:danger-no-sandbox` — no sandbox at all (don't use this unless you understand it) + +You can override the default in `~/.codex/config.toml` outside Hermes' managed block: + +```toml +default_permissions = ":read-only" +``` + +(Hermes will preserve your override on re-migration as long as it lives outside the `# managed by hermes-agent` markers.) + +## Auxiliary tasks and ChatGPT subscription token cost + +When this runtime is on with the `openai-codex` provider, **auxiliary tasks (title generation, context compression, vision auto-detect, session search summarization, the background self-improvement review fork) also flow through your ChatGPT subscription by default**, because Hermes' auxiliary client uses the main provider/model when no per-task override is set. + +This isn't specific to `codex_app_server` — it's true for the existing `codex_responses` path too — but it's more visible here because you're explicitly opting in for the subscription billing. 
+ +To route specific aux tasks to a cheaper / different model, set explicit overrides in `~/.hermes/config.yaml`: + +```yaml +auxiliary: + title_generation: + provider: openrouter + model: google/gemini-3-flash-preview + context_compression: + provider: openrouter + model: google/gemini-3-flash-preview + vision_detect: + provider: openrouter + model: google/gemini-3-flash-preview + session_search: + provider: openrouter + model: google/gemini-3-flash-preview + goal_judge: + provider: openrouter + model: google/gemini-3-flash-preview +``` + +The self-improvement review fork inherits the main runtime via `_current_main_runtime()` and Hermes downgrades it from `codex_app_server` to `codex_responses` automatically (so the fork can actually call `memory` and `skill_manage` — Hermes' own agent-loop tools). That fork still uses your subscription auth unless you've routed aux tasks elsewhere. + +## Editing `~/.codex/config.toml` safely + +Hermes wraps everything it manages between two marker comments: + +```toml +# managed by hermes-agent — `hermes codex-runtime migrate` regenerates this section +default_permissions = ":workspace" +[mcp_servers.filesystem] +... +[plugins."github@openai-curated"] +... +# end hermes-agent managed section +``` + +Anything **outside** that block is yours. Re-running migration (via `/codex-runtime codex_app_server` or whenever you toggle the runtime on) replaces the managed block in place but preserves user content above and below it verbatim. This means you can: + +- Add your own MCP servers Hermes doesn't know about +- Override `default_permissions` to `:read-only` if you prefer to be prompted +- Configure codex-only options (model, providers, otel, etc.) +- Add user-defined permission profiles in `[permissions.<name>]` tables + +Anything you add **inside** the managed block will get clobbered on the next migration. If you need a tweak that requires editing the managed block, file an issue and we'll add the knob. 
+ +## Multi-profile / multi-tenant setups + +By default, Hermes points the codex subprocess at `~/.codex/` regardless of which Hermes profile is active. This means `hermes -p work` and `hermes -p personal` share the same Codex auth, plugins, and config. For most users this is the right behavior — it matches what running `codex` CLI directly would do. + +If you want per-profile Codex isolation (separate auth, separate installed plugins, separate config), set `CODEX_HOME` explicitly per profile. The cleanest way is to point at a directory under your `HERMES_HOME`: + +```bash +# Inside the work profile, you might wrap hermes: +CODEX_HOME=~/.hermes/profiles/work/codex hermes chat +``` + +You'll need to re-run `codex login` once with that `CODEX_HOME` set so the OAuth tokens land in the profile-scoped location. After that, `hermes -p work` will operate on isolated Codex state. + +We don't auto-scope this because moving an existing user's `~/.codex/` would silently invalidate their Codex CLI auth — anyone who already ran `codex login` would have to re-authenticate. Opt-in feels safer than surprising users. + +## HOME environment variable passthrough + +Hermes does NOT rewrite `HOME` when spawning the codex app-server subprocess (we use `os.environ.copy()` and only overlay `CODEX_HOME` and `RUST_LOG`). This means: + +- Commands codex runs via its `shell` tool see the real user `HOME` and find `~/.gitconfig`, `~/.gh/`, `~/.aws/`, `~/.npmrc`, etc. correctly. +- Codex's internal state stays isolated through `CODEX_HOME` (which points at `~/.codex/` by default). + +This matches the boundary OpenClaw arrived at after some early experimentation: isolate Codex's state, leave the user's home alone. (Cf. openclaw/openclaw#81562.) + +## MCP server migration + +Hermes' `mcp_servers` config is auto-translated to the TOML format Codex expects. 
The migration runs every time you enable the runtime and is idempotent — re-runs replace the managed section but preserve any user-edited Codex config. + +What translates: + +| Hermes (`config.yaml`) | Codex (`config.toml`) | +|---|---| +| `command` + `args` + `env` | stdio transport | +| `url` + `headers` | streamable_http transport | +| `timeout` | `tool_timeout_sec` | +| `connect_timeout` | `startup_timeout_sec` | +| `enabled: false` | `enabled = false` | + +What's not migrated: +- Hermes-specific keys like `sampling` (Codex's MCP client has no equivalent — these are dropped with a per-server warning). + +## Native Codex plugin migration + +Plugins installed via `codex plugin` (Linear, GitHub, Gmail, Calendar, Canva, etc.) are discovered through Codex's `plugin/list` RPC. For each plugin where `installed: true`, Hermes writes a `[plugins."<name>@openai-curated"]` block enabling it in your Hermes session. + +This means: when your friend says "I have Calendar and GitHub set up in my Codex CLI" and they enable Hermes' codex runtime, Hermes activates those automatically. No re-configuration needed. + +What's NOT migrated: +- Plugins not yet installed in Codex CLI. Install them via `codex plugin` first. +- ChatGPT app marketplace entries (the per-account `app/list` results — these are already enabled inside codex by virtue of your account auth). +- Plugin OAuth — you authorize each plugin once in Codex itself; Hermes doesn't touch credentials. + +## Hermes tool callback (the new MCP server) + +Codex's built-in toolset covers shell/file ops/patches but doesn't have web search, browser automation, vision, image generation, etc. 
To keep those usable in a codex turn, Hermes registers itself as an MCP server in `~/.codex/config.toml`: + +```toml +[mcp_servers.hermes-tools] +command = "/path/to/python" +args = ["-m", "agent.transports.hermes_tools_mcp_server"] +env = { HERMES_HOME = "/your/.hermes", PYTHONPATH = "...", HERMES_QUIET = "1" } +startup_timeout_sec = 30.0 +tool_timeout_sec = 600.0 +``` + +When the model calls `web_search` (or another exposed Hermes tool), codex spawns the `hermes_tools_mcp_server` subprocess via stdio, the request is dispatched through `model_tools.handle_function_call()`, and the result is projected back to codex like any other MCP response. + +**Tools available via the callback:** `web_search`, `web_extract`, `browser_navigate`, `browser_click`, `browser_type`, `browser_press`, `browser_snapshot`, `browser_scroll`, `browser_back`, `browser_get_images`, `browser_console`, `browser_vision`, `vision_analyze`, `image_generate`, `skill_view`, `skills_list`, `text_to_speech`. + +**Tools NOT available:** `delegate_task`, `memory`, `session_search`, `todo`. These need the running AIAgent context to dispatch (mid-loop state) and a stateless MCP callback can't drive them. Use the default Hermes runtime (`/codex-runtime auto`) when you need these. + +## Disabling + +Switch back at any time: + +``` +/codex-runtime auto +``` + +Effective on the next session. The Codex managed block stays in `~/.codex/config.toml` so you can re-enable later without losing config — or remove it manually if you prefer. + +## Limitations + +This runtime is **opt-in beta**. 
Working as of Hermes Agent 2026.5 + Codex CLI 0.130.0: + +- Multi-turn conversations +- `commandExecution` and `fileChange` (apply_patch) approvals via Hermes UI +- MCP tool calls (verified against `@modelcontextprotocol/server-filesystem` and the new `hermes-tools` callback) +- Native Codex plugin migration (verified against Linear / GitHub / Calendar inventory) +- Deny/cancel paths +- Toggle on/off cycle +- Memory and skill nudge counters (verified live via integration tests) +- Hermes web_search through codex (verified live: "OpenAI Codex CLI – Getting Started" returned end-to-end) + +Known limitations: + +- **Hermes auth and codex auth are separate sessions.** You need both `codex login` AND `hermes auth login codex` for the cleanest UX (the runtime uses codex's session for the LLM call). This is a deliberate design choice in Hermes' `_import_codex_cli_tokens` — Hermes won't share OAuth state with codex CLI to avoid clobbering each other on token refresh. +- **`delegate_task`, `memory`, `session_search`, `todo` are unavailable on this runtime.** They need the running AIAgent context which a stateless MCP callback can't provide. Use `/codex-runtime auto` when you need these. +- **No inline patch preview in approval prompts when codex doesn't track the changeset.** Codex's `fileChange` approval params don't always carry the changeset. Hermes caches the data from the corresponding `item/started` notification when possible, but if approval arrives before the item has streamed, the prompt falls back to whatever `reason` codex provides. +- **Sub-second cancellation isn't guaranteed.** Mid-stream interrupts (Ctrl+C while codex is responding) are sent via `turn/interrupt`, but if codex has already flushed the final message, you get the response anyway. + +If you find a bug, [open an issue](https://github.com/NousResearch/hermes-agent/issues) with the output of `hermes logs --since 5m`. Mention `codex-runtime` in the title so it's easy to triage. 
+ +## Architecture + +``` + ┌─── Hermes shell (CLI / TUI / gateway) ───┐ + │ sessions DB · slash commands · memory │ + │ & skill review · cron · session pickers │ + └──┬──────────────────────────────────────┬┘ + │ user_message final │ + ▼ text + │ + ┌──────────────────────────────────┐ projected │ + │ AIAgent.run_conversation() │ messages │ + │ if api_mode == codex_app_server │ │ + │ → CodexAppServerSession │ │ + │ else: chat_completions / codex_responses (default) + └────┬─────────────────────────────┘ │ + │ JSON-RPC over stdio │ + ▼ │ + ┌──────────────────────────────────┐ │ + │ codex app-server (subprocess) │──────────────┘ + │ thread/start, turn/start │ + │ item/* notifications │ + │ shell + apply_patch + update_plan│ + │ view_image + sandbox │ + │ ┌─────────────────────────┐ │ + │ │ MCP client │ │ + │ │ ├─ user MCP servers │ │ + │ │ ├─ native plugins │ │ + │ │ │ (linear, github, │ │ + │ │ │ gmail, calendar, │ │ + │ │ │ canva, ...) │ │ + │ │ └─ hermes-tools ───────┼─────────────────┐ + │ │ (callback to │ │ │ + │ │ Hermes' richer │ │ │ + │ │ tools) │ │ │ + │ └─────────────────────────┘ │ │ + └──────────────────────────────────┘ │ + │ + ▼ + ┌──────────────────────────────────────────────────────────┐ + │ hermes_tools_mcp_server.py (subprocess on demand) │ + │ web_search, web_extract, browser_*, vision_analyze, │ + │ image_generate, skill_view, skills_list, text_to_speech│ + └──────────────────────────────────────────────────────────┘ +``` + +For implementation details, see [PR #24182](https://github.com/NousResearch/hermes-agent/pull/24182) and the [Codex app-server protocol README](https://github.com/openai/codex/blob/main/codex-rs/app-server/README.md). 
diff --git a/website/sidebars.ts b/website/sidebars.ts index f706d2a607d..6bdd5d296a0 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -68,6 +68,7 @@ const sidebars: SidebarsConfig = { 'user-guide/features/cron', 'user-guide/features/delegation', 'user-guide/features/kanban', + 'user-guide/features/codex-app-server-runtime', 'user-guide/features/kanban-tutorial', 'user-guide/features/kanban-worker-lanes', 'user-guide/features/goals',