diff --git a/agent/agent_init.py b/agent/agent_init.py index afa3d3aa62a..41f7cc11bbb 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1307,6 +1307,12 @@ def init_agent( _agent_section = {} agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto") + # Intent-ack continuation config: "auto" (default — codex_responses only, + # the historical gate), true (all api_modes), false (never), or a list of + # model-name substrings. Resolved against the active api_mode/model in the + # conversation loop's intent-ack block. + agent._intent_ack_continuation = _agent_section.get("intent_ack_continuation", "auto") + # Universal task-completion guidance toggle. Default True. Surfaced # as a separate flag from tool_use_enforcement because the guidance # applies to ALL models, not just the model families enforcement diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index a63305dd063..21a14c97708 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -2205,8 +2205,21 @@ def looks_like_codex_intermediate_ack( user_message: str, assistant_content: str, messages: List[Dict[str, Any]], + require_workspace: bool = True, ) -> bool: - """Detect a planning/ack message that should continue instead of ending the turn.""" + """Detect a planning/ack message that should continue instead of ending the turn. + + ``require_workspace`` (default True) keeps the original codex-coding scope: + the ack must reference a filesystem/repo workspace. The conversation loop + passes ``require_workspace=False`` when the user has explicitly opted into + intent-ack continuation for all api_modes (``agent.intent_ack_continuation`` + is ``true`` or a model-list), so general autonomous workflows ("I'll run a + health check on the server", "I'll start the deployment") — which carry a + future-ack and an action verb but no filesystem reference — are caught too. + The future-ack + short-content + no-prior-tools + action-verb requirements + always apply, which is what keeps conversational "I'll help you brainstorm" + replies from tripping it. + """ if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages): return False @@ -2259,17 +2272,67 @@ def looks_like_codex_intermediate_ack( "path", ) + assistant_mentions_action = any(marker in assistant_text for marker in action_markers) + if not assistant_mentions_action: + return False + + # Opted-in (all-api_mode) path: a future-ack + action verb + no prior tool + # call is enough — the user asked us to keep going when the model only + # announces intent, regardless of whether a filesystem is involved. + if not require_workspace: + return True + user_text = (user_message or "").strip().lower() user_targets_workspace = ( any(marker in user_text for marker in workspace_markers) or "~/" in user_text or "/" in user_text ) - assistant_mentions_action = any(marker in assistant_text for marker in action_markers) assistant_targets_workspace = any( marker in assistant_text for marker in workspace_markers ) - return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action + return user_targets_workspace or assistant_targets_workspace + + +def intent_ack_continuation_mode(agent) -> str: + """Classify the resolved intent-ack continuation mode for this turn. + + Returns one of: + * ``"off"`` — never continue. + * ``"codex_only"`` — historical scope: continue only on the + ``codex_responses`` api_mode, and only for codebase/workspace acks + (``require_workspace=True``). + * ``"all"`` — user opted in for every api_mode; continue on any + future-ack + action verb (``require_workspace=False``). + + Mirrors the four-mode shape of ``agent.tool_use_enforcement``: ``"auto"`` + (default) → codex_only; ``True``/"true"/"always"/"yes"/"on" → all; + ``False``/"false"/"never"/"no"/"off" → off; ``list`` → all when a substring + matches the active model name, else off. + """ + mode = getattr(agent, "_intent_ack_continuation", "auto") + + if mode is True or (isinstance(mode, str) and mode.lower() in {"true", "always", "yes", "on"}): + return "all" + if mode is False or (isinstance(mode, str) and mode.lower() in {"false", "never", "no", "off"}): + return "off" + if isinstance(mode, list): + model_lower = (agent.model or "").lower() + return "all" if any(p.lower() in model_lower for p in mode if isinstance(p, str)) else "off" + # "auto" or any unrecognised value — historical codex-only behavior. + return "codex_only" if agent.api_mode == "codex_responses" else "off" + + +def intent_ack_continuation_enabled(agent) -> bool: + """Whether intent-ack continuation should fire at all for this turn. + + The ``codex_ack_continuations < 2`` per-turn cap and the + ``looks_like_codex_intermediate_ack`` detector are applied by the caller; + this only decides the on/off gate. Callers that also need to know whether + the workspace requirement applies should use ``intent_ack_continuation_mode`` + directly (``"codex_only"`` ⇒ require_workspace=True, ``"all"`` ⇒ False). + """ + return intent_ack_continuation_mode(agent) != "off" diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 983eec6df85..cb71e724159 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -4637,14 +4637,20 @@ def run_conversation( # status from earlier failed attempts in this turn. agent._clear_status_buffer() + from agent.agent_runtime_helpers import ( + intent_ack_continuation_mode, + ) + + _ack_mode = intent_ack_continuation_mode(agent) if ( - agent.api_mode == "codex_responses" + _ack_mode != "off" and agent.valid_tool_names and codex_ack_continuations < 2 and agent._looks_like_codex_intermediate_ack( user_message=user_message, assistant_content=final_response, messages=messages, + require_workspace=(_ack_mode == "codex_only"), ) ): codex_ack_continuations += 1 diff --git a/hermes_cli/config.py b/hermes_cli/config.py index a3496597e79..bee791543c5 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -939,6 +939,16 @@ DEFAULT_CONFIG = { # (force on/off for all models), or a list of model-name substrings # to match (e.g. ["gpt", "codex", "gemini", "qwen"]). "tool_use_enforcement": "auto", + # Intent-ack continuation: when the model opens a turn by narrating an + # action it will take ("I'll go check the logs...") but emits no tool + # call, intercept the turn-end, inject a "continue now, execute the + # tools" nudge, and loop instead of ending the turn (capped at 2 nudges + # per turn). This is the corrective sibling of tool_use_enforcement (the + # preventive prompt-side guard). Values: "auto" (default — fires only on + # the codex_responses api_mode, the historical behavior), true (all + # api_modes — fixes the Gemini/Claude "stops after stating intent" case), + # false (never), or a list of model-name substrings to match. + "intent_ack_continuation": "auto", # Universal "finish the job" guidance — short prompt block applied to # all models that targets two cross-family failure modes: (1) stopping # after a stub instead of finishing the artifact, (2) fabricating diff --git a/infographic/intent-ack-continuation/infographic.png b/infographic/intent-ack-continuation/infographic.png new file mode 100644 index 00000000000..e509b96a00a Binary files /dev/null and b/infographic/intent-ack-continuation/infographic.png differ diff --git a/run_agent.py b/run_agent.py index 3a3b6f4f0ef..506403ca5bb 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1424,10 +1424,13 @@ class AIAgent: user_message: str, assistant_content: str, messages: List[Dict[str, Any]], + require_workspace: bool = True, ) -> bool: """Forwarder — see ``agent.agent_runtime_helpers.looks_like_codex_intermediate_ack``.""" from agent.agent_runtime_helpers import looks_like_codex_intermediate_ack - return looks_like_codex_intermediate_ack(self, user_message, assistant_content, messages) + return looks_like_codex_intermediate_ack( + self, user_message, assistant_content, messages, require_workspace + ) def _extract_reasoning(self, assistant_message) -> Optional[str]: """Forwarder — see ``agent.agent_runtime_helpers.extract_reasoning``.""" diff --git a/tests/agent/test_intent_ack_continuation.py b/tests/agent/test_intent_ack_continuation.py new file mode 100644 index 00000000000..a529020aa7c --- /dev/null +++ b/tests/agent/test_intent_ack_continuation.py @@ -0,0 +1,160 @@ +"""Intent-ack continuation gate + detector behavior. + +Covers the config-driven generalization of the codex intent-ack continuation +(issue #27881): the historical ``codex_responses``-only path is byte-stable +under the default ``"auto"`` mode, while an explicit ``true``/model-list opt-in +extends the "you announced an action but called no tool — keep going" nudge to +every api_mode and relaxes the codebase/workspace requirement so general +autonomous workflows ("I'll run a health check on the server") are caught. + +These are invariant assertions about how the mode string and the detector +gates relate, not snapshots of the marker lists. +""" + +from types import SimpleNamespace +from typing import Union + +from agent.agent_runtime_helpers import ( + intent_ack_continuation_enabled, + intent_ack_continuation_mode, + looks_like_codex_intermediate_ack, +) + + +def _agent( + mode: Union[str, bool, list] = "auto", + api_mode="chat_completions", + model="anthropic/claude-sonnet-4", +): + # _strip_think_blocks is a no-op for these plain-text fixtures. + return SimpleNamespace( + _intent_ack_continuation=mode, + api_mode=api_mode, + model=model, + _strip_think_blocks=lambda c: c, + ) + + +# The reporter's exact repro (#27881): server-ops task, no filesystem reference. +REPRO_USER = ( + "check the current status of the server, grab the latest error logs, " + "and let me know if there's anything critical" +) +REPRO_ACK = "I will start by running a health check command on the server to see its current status." + +# The codex-coding case the detector was originally built for. +CODE_USER = "review the codebase in /app" +CODE_ACK = "Let me inspect the repository files first." + + +# ── mode resolution ──────────────────────────────────────────────────────── + + +def test_auto_is_codex_only(): + assert intent_ack_continuation_mode(_agent("auto", "codex_responses")) == "codex_only" + assert intent_ack_continuation_mode(_agent("auto", "chat_completions")) == "off" + assert intent_ack_continuation_mode(_agent("auto", "anthropic")) == "off" + + +def test_true_is_all_api_modes(): + for am in ("chat_completions", "anthropic", "codex_responses"): + assert intent_ack_continuation_mode(_agent(True, am)) == "all" + for s in ("true", "always", "yes", "on", "ON"): + assert intent_ack_continuation_mode(_agent(s, "chat_completions")) == "all" + + +def test_false_is_off_even_for_codex(): + assert intent_ack_continuation_mode(_agent(False, "codex_responses")) == "off" + for s in ("false", "never", "no", "off"): + assert intent_ack_continuation_mode(_agent(s, "codex_responses")) == "off" + + +def test_list_matches_model_substring(): + assert intent_ack_continuation_mode( + _agent(["gemini", "qwen"], "chat_completions", "google/gemini-3-pro") + ) == "all" + assert intent_ack_continuation_mode( + _agent(["gemini", "qwen"], "chat_completions", "anthropic/claude-sonnet-4") + ) == "off" + + +def test_unrecognised_value_falls_back_to_auto(): + assert intent_ack_continuation_mode(_agent("garbage", "codex_responses")) == "codex_only" + assert intent_ack_continuation_mode(_agent("garbage", "chat_completions")) == "off" + + +def test_missing_attr_defaults_to_auto(): + bare = SimpleNamespace(api_mode="chat_completions", model="x", _strip_think_blocks=lambda c: c) + assert intent_ack_continuation_mode(bare) == "off" + bare_codex = SimpleNamespace(api_mode="codex_responses", model="x", _strip_think_blocks=lambda c: c) + assert intent_ack_continuation_mode(bare_codex) == "codex_only" + + +def test_enabled_is_mode_not_off(): + assert intent_ack_continuation_enabled(_agent(True, "chat_completions")) is True + assert intent_ack_continuation_enabled(_agent("auto", "codex_responses")) is True + assert intent_ack_continuation_enabled(_agent("auto", "chat_completions")) is False + assert intent_ack_continuation_enabled(_agent(False, "codex_responses")) is False + + +# ── detector: workspace requirement ───────────────────────────────────────── + + +def test_codex_only_path_requires_workspace(): + a = _agent("auto", "codex_responses") + msgs = [{"role": "user", "content": CODE_USER}] + # codebase ack matches workspace markers → fires + assert looks_like_codex_intermediate_ack(a, CODE_USER, CODE_ACK, msgs, require_workspace=True) + # server-ops ack has no filesystem reference → does NOT fire (historical scope) + repro_msgs = [{"role": "user", "content": REPRO_USER}] + assert not looks_like_codex_intermediate_ack( + a, REPRO_USER, REPRO_ACK, repro_msgs, require_workspace=True + ) + + +def test_all_path_drops_workspace_requirement(): + """The #27881 fix: opted-in turns catch non-codebase intent acks.""" + a = _agent(True, "chat_completions") + msgs = [{"role": "user", "content": REPRO_USER}] + assert looks_like_codex_intermediate_ack( + a, REPRO_USER, REPRO_ACK, msgs, require_workspace=False + ) + + +# ── detector: guardrails that hold regardless of workspace ─────────────────── + + +def test_real_final_answer_does_not_fire(): + a = _agent(True, "chat_completions") + final = "Done. The server is healthy and there are no critical errors in the logs." + msgs = [{"role": "user", "content": REPRO_USER}] + assert not looks_like_codex_intermediate_ack(a, REPRO_USER, final, msgs, require_workspace=False) + + +def test_conversational_reply_without_action_verb_does_not_fire(): + a = _agent(True, "chat_completions") + brainstorm = "I'll help you think through the tradeoffs here." + msgs = [{"role": "user", "content": "help me decide"}] + assert not looks_like_codex_intermediate_ack( + a, "help me decide", brainstorm, msgs, require_workspace=False + ) + + +def test_does_not_fire_after_a_tool_already_ran(): + a = _agent(True, "chat_completions") + msgs = [ + {"role": "user", "content": REPRO_USER}, + {"role": "tool", "content": "health check result"}, + ] + assert not looks_like_codex_intermediate_ack( + a, REPRO_USER, REPRO_ACK, msgs, require_workspace=False + ) + + +def test_long_response_is_not_treated_as_an_ack(): + a = _agent(True, "chat_completions") + long_ack = "I will run the check. " + ("x" * 1300) + msgs = [{"role": "user", "content": REPRO_USER}] + assert not looks_like_codex_intermediate_ack( + a, REPRO_USER, long_ack, msgs, require_workspace=False + )