hermes-agent/tests/agent/test_intent_ack_continuation.py
Teknium d43e0cf304
fix(agent): config-driven intent-ack continuation for all api_modes (#27881) (#53943)
* fix(agent): config-driven intent-ack continuation for all api_modes (#27881)

The agent could end a turn after only stating intent ('I will run a health
check...') without executing the announced tool call, forcing the user to
re-prompt. A continuation guard that catches this and nudges the model to
proceed already existed but was hard-gated to the codex_responses api_mode,
so Gemini/Claude/OpenRouter turns never benefited.

- New agent.intent_ack_continuation config (default 'auto' = codex-only,
  byte-stable for existing conversations). 'true'/model-list opts every
  api_mode in; 'false' disables. Mirrors agent.tool_use_enforcement's shape.
- looks_like_codex_intermediate_ack gains require_workspace (default True).
  The opted-in path drops the codebase/filesystem requirement so general
  autonomous workflows (server ops, deploys, API calls) are caught, not just
  coding tasks. Future-ack + action-verb + short-content + no-prior-tool
  guards still apply; the 2-nudge-per-turn cap is unchanged.
- Resolution centralized in intent_ack_continuation_mode (off/codex_only/all).

* docs(infographic): intent-ack continuation (#27881)
2026-06-27 20:46:00 -07:00

160 lines
6.5 KiB
Python

"""Intent-ack continuation gate + detector behavior.
Covers the config-driven generalization of the codex intent-ack continuation
(issue #27881): the historical ``codex_responses``-only path is byte-stable
under the default ``"auto"`` mode, while an explicit ``true``/model-list opt-in
extends the "you announced an action but called no tool — keep going" nudge to
every api_mode and relaxes the codebase/workspace requirement so general
autonomous workflows ("I'll run a health check on the server") are caught.
These are invariant assertions about how the mode string and the detector
gates relate, not snapshots of the marker lists.
"""
from types import SimpleNamespace
from typing import Union
from agent.agent_runtime_helpers import (
intent_ack_continuation_enabled,
intent_ack_continuation_mode,
looks_like_codex_intermediate_ack,
)
def _agent(
    mode: Union[str, bool, list] = "auto",
    api_mode: str = "chat_completions",
    model: str = "anthropic/claude-sonnet-4",
) -> SimpleNamespace:
    """Build a minimal stand-in agent for the intent-ack helpers.

    Args:
        mode: Value stored as ``_intent_ack_continuation`` (the config knob
            under test): a string, a bool, or a list of model-name fragments.
        api_mode: Value stored as ``api_mode``.
        model: Value stored as ``model``.

    Returns:
        A ``SimpleNamespace`` carrying exactly those three attributes plus an
        identity ``_strip_think_blocks`` callable.
    """
    # _strip_think_blocks is a no-op for these plain-text fixtures.
    return SimpleNamespace(
        _intent_ack_continuation=mode,
        api_mode=api_mode,
        model=model,
        _strip_think_blocks=lambda c: c,
    )
# Fixture for the reporter's exact repro (#27881): a server-ops request that
# contains no filesystem reference.
REPRO_USER = (
    "check the current status of the server, "
    "grab the latest error logs, "
    "and let me know if there's anything critical"
)
REPRO_ACK = (
    "I will start by running a health check command on the server "
    "to see its current status."
)

# Fixture for the codex-coding scenario the detector was originally built for.
CODE_USER = "review the codebase in /app"
CODE_ACK = "Let me inspect the repository files first."
# ── mode resolution ────────────────────────────────────────────────────────
def test_auto_is_codex_only():
    """Under ``"auto"`` only the ``codex_responses`` api_mode gets the continuation."""
    assert intent_ack_continuation_mode(_agent("auto", "codex_responses")) == "codex_only"
    # Every other api_mode stays off under the default.
    assert intent_ack_continuation_mode(_agent("auto", "chat_completions")) == "off"
    assert intent_ack_continuation_mode(_agent("auto", "anthropic")) == "off"
def test_true_is_all_api_modes():
    """An explicit opt-in resolves to ``"all"`` whatever the api_mode."""
    for am in ("chat_completions", "anthropic", "codex_responses"):
        # The message names the api_mode that broke the invariant.
        assert intent_ack_continuation_mode(_agent(True, am)) == "all", am
    # String spellings of "enabled", including an upper-case one, behave the same.
    for s in ("true", "always", "yes", "on", "ON"):
        assert intent_ack_continuation_mode(_agent(s, "chat_completions")) == "all", s
def test_false_is_off_even_for_codex():
    """An explicit opt-out disables the continuation even on ``codex_responses``."""
    assert intent_ack_continuation_mode(_agent(False, "codex_responses")) == "off"
    # String spellings of "disabled" behave like the boolean.
    for s in ("false", "never", "no", "off"):
        assert intent_ack_continuation_mode(_agent(s, "codex_responses")) == "off", s
def test_list_matches_model_substring():
    """A list value yields ``"all"`` when an entry occurs in the model name, else ``"off"``."""
    # "gemini" occurs inside "google/gemini-3-pro" -> opted in.
    assert intent_ack_continuation_mode(
        _agent(["gemini", "qwen"], "chat_completions", "google/gemini-3-pro")
    ) == "all"
    # Neither entry occurs in the Claude model name -> stays off.
    assert intent_ack_continuation_mode(
        _agent(["gemini", "qwen"], "chat_completions", "anthropic/claude-sonnet-4")
    ) == "off"
def test_unrecognised_value_falls_back_to_auto():
    """An unknown string resolves the same way ``"auto"`` does for each api_mode."""
    assert intent_ack_continuation_mode(_agent("garbage", "codex_responses")) == "codex_only"
    assert intent_ack_continuation_mode(_agent("garbage", "chat_completions")) == "off"
def test_missing_attr_defaults_to_auto():
    """An agent without ``_intent_ack_continuation`` resolves the same way as under auto."""
    # Built by hand rather than via _agent() so the config attribute is truly absent.
    bare = SimpleNamespace(api_mode="chat_completions", model="x", _strip_think_blocks=lambda c: c)
    assert intent_ack_continuation_mode(bare) == "off"
    bare_codex = SimpleNamespace(api_mode="codex_responses", model="x", _strip_think_blocks=lambda c: c)
    assert intent_ack_continuation_mode(bare_codex) == "codex_only"
def test_enabled_is_mode_not_off():
    """``intent_ack_continuation_enabled`` is ``True`` for the on-cases and ``False`` for the off-cases."""
    # Identity checks pin an actual bool, not merely a truthy/falsy value.
    assert intent_ack_continuation_enabled(_agent(True, "chat_completions")) is True
    assert intent_ack_continuation_enabled(_agent("auto", "codex_responses")) is True
    assert intent_ack_continuation_enabled(_agent("auto", "chat_completions")) is False
    assert intent_ack_continuation_enabled(_agent(False, "codex_responses")) is False
# ── detector: workspace requirement ─────────────────────────────────────────
def test_codex_only_path_requires_workspace():
    """With ``require_workspace=True`` only the codebase-flavoured ack is detected."""
    a = _agent("auto", "codex_responses")
    msgs = [{"role": "user", "content": CODE_USER}]
    # codebase ack matches workspace markers → fires
    assert looks_like_codex_intermediate_ack(a, CODE_USER, CODE_ACK, msgs, require_workspace=True)
    # server-ops ack has no filesystem reference → does NOT fire (historical scope)
    repro_msgs = [{"role": "user", "content": REPRO_USER}]
    assert not looks_like_codex_intermediate_ack(
        a, REPRO_USER, REPRO_ACK, repro_msgs, require_workspace=True
    )
def test_all_path_drops_workspace_requirement():
    """The #27881 fix: opted-in turns catch non-codebase intent acks."""
    a = _agent(True, "chat_completions")
    msgs = [{"role": "user", "content": REPRO_USER}]
    # Same server-ops ack the workspace-gated path ignores; here it must be detected.
    assert looks_like_codex_intermediate_ack(
        a, REPRO_USER, REPRO_ACK, msgs, require_workspace=False
    )
# ── detector: guardrails that hold regardless of workspace ───────────────────
def test_real_final_answer_does_not_fire():
    """A reply that reports results, not intent, is not flagged as an ack."""
    a = _agent(True, "chat_completions")
    final = "Done. The server is healthy and there are no critical errors in the logs."
    msgs = [{"role": "user", "content": REPRO_USER}]
    assert not looks_like_codex_intermediate_ack(a, REPRO_USER, final, msgs, require_workspace=False)
def test_conversational_reply_without_action_verb_does_not_fire():
    """A future-tense but purely conversational reply is not flagged as an ack."""
    a = _agent(True, "chat_completions")
    # Starts with "I'll" yet announces no tool-style action.
    brainstorm = "I'll help you think through the tradeoffs here."
    msgs = [{"role": "user", "content": "help me decide"}]
    assert not looks_like_codex_intermediate_ack(
        a, "help me decide", brainstorm, msgs, require_workspace=False
    )
def test_does_not_fire_after_a_tool_already_ran():
    """Once the history holds a ``tool`` message, the same ack text is not flagged."""
    a = _agent(True, "chat_completions")
    msgs = [
        {"role": "user", "content": REPRO_USER},
        # A tool result already exists in this history.
        {"role": "tool", "content": "health check result"},
    ]
    assert not looks_like_codex_intermediate_ack(
        a, REPRO_USER, REPRO_ACK, msgs, require_workspace=False
    )
def test_long_response_is_not_treated_as_an_ack():
    """An ack-style opener padded with 1300 filler characters is not flagged."""
    a = _agent(True, "chat_completions")
    # The opener alone reads like an intent ack; only the added length differs.
    long_ack = "I will run the check. " + ("x" * 1300)
    msgs = [{"role": "user", "content": REPRO_USER}]
    assert not looks_like_codex_intermediate_ack(
        a, REPRO_USER, long_ack, msgs, require_workspace=False
    )