fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)

In live testing on gemini-3-flash-preview, the judge kept returning empty or non-JSON content, tripping the consecutive-parse-failures auto-pause. Free-form JSON output is hopeful; tool-call schemas are enforced server-side by virtually every modern provider. Two new tools the judge calls: - submit_checklist(items) — Phase A, decompose - update_checklist(updates, new_items, reason) — Phase B, evaluate Both phases now call the auxiliary client with tool_choice forcing the right tool. read_file remains for Phase B history inspection, with the loop exiting only when update_checklist is called or the read budget is exhausted (at which point read_file is dropped from the toolbox and update_checklist is forced). Robustness: - _call_judge_with_tool_choice falls back tool_choice forced→required→auto if the provider rejects a particular shape. - If a fully-broken provider still returns content instead of a tool call, the legacy JSON-text parsers stay around as a last-ditch backstop so we never silently lose a checklist. - _normalize_update_args replaces the JSON parser for the apply layer; same 1-based→0-based conversion + terminal-status filter. Live verification: the same fizzbuzz goal that was hitting 'judge model returned unparseable output 3 turns in a row' before now terminates in 2 turns, all 11 items marked completed with item-specific evidence, no auto-pause. Agent log shows 'produced 11 checklist items via tool call' instead of the JSON-parse path. Tests: 7 new cases for the tool-call path (Phase A success, Phase B update only, Phase B read_file→update, JSON-content backstop, empty-text item dropping, non-terminal status filter).
2026-05-23 05:31:23 +00:00 · 2026-05-10 20:51:40 -07:00 · 2026-05-10 20:51:40 -07:00 · a63a2b7c78
commit a63a2b7c78
parent 4a080b1d5a
2 changed files with 708 additions and 156 deletions
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@ -143,8 +143,9 @@ DECOMPOSE_SYSTEM_PROMPT = (
    "publicly accessible, domain/URL is functional, etc. Better to "
    "over-specify and let a few items get marked impossible than to "
    "under-specify and let the agent declare victory early.\n\n"
-    "Reply ONLY with a single JSON object on one line:\n"
+    "Submit your checklist by calling the ``submit_checklist`` tool. Do "
-    '{"checklist": [{"text": "<item>"}, {"text": "<item>"}, ...]}'
+    "not reply with prose or JSON in your message body — call the tool. "
    "The system will not see anything you write outside the tool call."
 )
 DECOMPOSE_USER_PROMPT_TEMPLATE = (
@ -196,23 +197,20 @@ EVALUATE_SYSTEM_PROMPT_CHECKLIST = (
    "demonstrates the item cannot be achieved in this environment (NOT "
    "merely that the agent didn't try). Vague intentions ('I will do X "
    "next') do NOT count as completion.\n\n"
    "You may APPEND new checklist items if the agent's work reveals "
    "criteria the original decomposition missed. Stay strict — only add "
    "items that genuinely belong as completion criteria.\n\n"
    "STICKINESS: items already marked completed or impossible are frozen. "
    "Do not include them in your updates. Only the user can revert them.\n\n"
-    "TOOLS: you have read_file(path, offset, limit) available. The full "
+    "TOOLS:\n"
-    "conversation history for this session is dumped to a JSON file whose "
+    "- ``read_file(path, offset, limit)``: inspect the dumped conversation "
-    "path is given in the user message. Call read_file on it when the "
+    "history file whose path is given in the user message. Use this when "
-    "snippet is ambiguous, when you need to verify a tool call actually "
+    "the snippet alone isn't enough to rule. Each call costs tokens, so "
-    "happened, or when you want to see what skills the agent loaded. "
+    "only read when needed.\n"
-    "Otherwise, judge from the snippet directly — extra reads cost tokens.\n\n"
+    "- ``update_checklist(updates, new_items, reason)``: issue your "
-    "When you are ready to rule, reply ONLY with a single JSON object:\n"
+    "verdict. Call this exactly once per turn when you are ready to rule. "
-    '{"updates": [{"index": <i>, "status": "completed|impossible", "evidence": "<why>"}, ...], '
+    "Calling it ENDS the evaluation.\n\n"
-    '"new_items": [{"text": "<new item>"}, ...], '
+    "You MUST call one of these tools every turn. Do not reply with "
-    '"reason": "<one-sentence overall rationale>"}\n'
+    "prose or JSON in your message body — the system will not see "
-    "When citing evidence, reference the agent's actual output specifically. "
+    "anything written outside tool calls. When you cite evidence, "
-    "Empty updates is fine. Empty new_items is fine. The reason field is required."
+    "reference the agent's actual output specifically."
 )
 EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = (
@ -645,6 +643,17 @@ def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]:
 # ──────────────────────────────────────────────────────────────────────
 # ──────────────────────────────────────────────────────────────────────
 # Judge tool schemas: read_file (history inspection) +
 # submit_checklist (Phase A) + update_checklist (Phase B)
 #
 # Forcing the judge to emit through tool calls is dramatically more
 # reliable than asking it to reply with JSON text. Most providers
 # enforce the schema server-side, so weak/small judge models can no
 # longer drift into prose, markdown fences, or empty bodies.
 # ──────────────────────────────────────────────────────────────────────
 _JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
    "type": "function",
    "function": {
@ -685,6 +694,122 @@ _JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
 }
 # Phase A tool schema (chat-completions "function" tool format).
 # The judge submits its decomposition through this tool: a single required
 # ``items`` array whose entries are objects with one required string field,
 # ``text``.
 _JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
    "type": "function",
    "function": {
        "name": "submit_checklist",
        "description": (
            "Submit the harsh, detailed completion-criteria checklist you "
            "decomposed the goal into. Each item is one verifiable "
            "completion criterion. Bias toward more items, not fewer."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "items": {
                    "type": "array",
                    "description": (
                        "List of checklist items. Each item is a single "
                        "verifiable statement of fact about the finished "
                        "work. Aim for at least 5 items; more is better "
                        "when warranted."
                    ),
                    "items": {
                        "type": "object",
                        "properties": {
                            "text": {
                                "type": "string",
                                "description": "The completion-criterion text.",
                            },
                        },
                        "required": ["text"],
                    },
                },
            },
            "required": ["items"],
        },
    },
 }
 # Phase B tool schema (chat-completions "function" tool format).
 # The judge issues its verdict through this tool. All three top-level keys
 # (``updates``, ``new_items``, ``reason``) are listed as required; the two
 # arrays may be empty. ``index`` is the 1-based number shown in the rendered
 # checklist — ``_normalize_update_args`` converts it to 0-based.
 _JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
    "type": "function",
    "function": {
        "name": "update_checklist",
        "description": (
            "Issue your verdict on the current checklist. For each "
            "currently-pending item, decide whether the agent's most "
            "recent response (and conversation history if you read it) "
            "shows the item is satisfied. You may also append new items "
            "the original decomposition missed. Call this exactly once "
            "when you are ready to rule — calling it ends the evaluation."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "updates": {
                    "type": "array",
                    "description": (
                        "Per-item rulings. Use the 1-based ``index`` shown "
                        "in the checklist. ``status`` must be 'completed' "
                        "(clear evidence the item is done) or 'impossible' "
                        "(item cannot be achieved in this environment). "
                        "Items already in a terminal status are frozen — "
                        "do not include them."
                    ),
                    "items": {
                        "type": "object",
                        "properties": {
                            "index": {
                                "type": "integer",
                                "description": "1-based checklist index.",
                            },
                            "status": {
                                "type": "string",
                                "enum": ["completed", "impossible"],
                            },
                            "evidence": {
                                "type": "string",
                                "description": (
                                    "One-sentence specific citation of why "
                                    "this item is done or impossible. "
                                    "Reference the agent's actual output."
                                ),
                            },
                        },
                        "required": ["index", "status", "evidence"],
                    },
                },
                "new_items": {
                    "type": "array",
                    "description": (
                        "Optional: completion criteria the original "
                        "decomposition missed. Stay strict — only add "
                        "items that genuinely belong as completion "
                        "criteria for this goal."
                    ),
                    "items": {
                        "type": "object",
                        "properties": {
                            "text": {
                                "type": "string",
                                "description": "The new criterion text.",
                            },
                        },
                        "required": ["text"],
                    },
                },
                "reason": {
                    "type": "string",
                    "description": "One-sentence overall rationale for this round of updates.",
                },
            },
            "required": ["updates", "new_items", "reason"],
        },
    },
 }
 def _judge_read_file(
    path: str,
    *,
@ -770,15 +895,125 @@ def _get_judge_client() -> Tuple[Optional[Any], str]:
    return client, model
 def _extract_tool_call(msg: Any, tool_name: str) -> Optional[Dict[str, Any]]:
    """Find the first tool call named ``tool_name`` on a chat-completions message.

    Robust to provider shims that return tool calls as objects or dicts,
    and arguments as JSON strings or already-parsed dicts.

    Args:
        msg: Assistant message; its ``tool_calls`` attribute is read
            (missing or ``None`` is treated as no calls).
        tool_name: Function name to look for.

    Returns:
        ``{"id", "name", "arguments"}`` for the first matching call, or
        ``None`` if there is none. ``arguments`` is always a dict:
        unparseable JSON, an empty string, and JSON that decodes to a
        non-object (``null``, a list, a scalar) all become ``{}`` so
        callers can use ``.get(...)`` unconditionally. ``id`` falls back
        to ``"tc-?"`` when the provider supplied none.
    """
    tool_calls = getattr(msg, "tool_calls", None) or []
    for tc in tool_calls:
        try:
            tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?"
            fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None)
            if fn is None:
                continue
            fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "")
            if fn_name != tool_name:
                continue
            fn_args_raw = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "")
            args: Any = fn_args_raw
            if isinstance(fn_args_raw, str):
                try:
                    args = json.loads(fn_args_raw) if fn_args_raw else {}
                except Exception:
                    # Best-effort: a matching call with garbage arguments is
                    # still reported, just with no arguments.
                    args = {}
            # json.loads happily returns None / list / scalars for valid
            # non-object JSON; normalise so the dict contract always holds.
            if not isinstance(args, dict):
                args = {}
            return {"id": tc_id, "name": fn_name, "arguments": args}
        except Exception:
            # A malformed entry must not hide later, well-formed calls.
            continue
    return None
 def _serialize_assistant_tool_calls(msg: Any) -> List[Dict[str, Any]]:
    """Render ``msg.tool_calls`` as plain dicts suitable for echoing the
    assistant turn back in a later ``messages=[...]`` payload.

    Accepts tool calls given as attribute-style objects or as dicts.
    Non-string arguments are JSON-encoded (``"{}"`` if encoding fails),
    a missing id becomes ``"tc-?"``, and entries that raise while being
    read are skipped.
    """

    def _lookup(holder: Any, key: str, fallback: Any) -> Any:
        # Attribute first; if that is missing or falsy, try dict access;
        # otherwise hand back the fallback.
        found = getattr(holder, key, None)
        if found:
            return found
        return holder.get(key) if isinstance(holder, dict) else fallback

    serialized: List[Dict[str, Any]] = []
    raw_calls = getattr(msg, "tool_calls", None) or []
    for call in raw_calls:
        try:
            call_id = _lookup(call, "id", None) or "tc-?"
            function = _lookup(call, "function", None)
            name = _lookup(function, "name", "")
            arguments = _lookup(function, "arguments", "")
            if not isinstance(arguments, str):
                try:
                    arguments = json.dumps(arguments)
                except Exception:
                    arguments = "{}"
            entry = {
                "id": call_id,
                "type": "function",
                "function": {"name": name or "", "arguments": arguments},
            }
        except Exception:
            continue
        serialized.append(entry)
    return serialized
 def _call_judge_with_tool_choice(
    client: Any,
    *,
    model: str,
    messages: List[Dict[str, Any]],
    tools: List[Dict[str, Any]],
    forced_tool_name: Optional[str],
    timeout: float,
    max_tokens: int = 1500,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Call the judge, relaxing ``tool_choice`` when the provider rejects it.

    Attempts, in order: the specific function named by ``forced_tool_name``
    (when given), then ``"required"``, then ``"auto"``. Each distinct
    choice is tried at most once.

    Args:
        client: Chat-completions client exposing
            ``client.chat.completions.create(...)``.
        model: Model name passed through to the provider.
        messages: Conversation payload.
        tools: Tool schemas offered to the judge.
        forced_tool_name: Tool to force on the first attempt, or ``None``
            to start at ``"required"``.
        timeout: Per-request timeout passed through to the provider.
        max_tokens: Completion token cap.

    Returns:
        ``(response, None)`` on success, or ``(None, error)`` where
        ``error`` is ``"<ExceptionType>: <message>"`` from the last
        failed attempt.
    """
    # First attempt: force the specific tool. Most modern providers
    # support {"type": "function", "function": {"name": "..."}}.
    primary_choice: Any
    if forced_tool_name:
        primary_choice = {"type": "function", "function": {"name": forced_tool_name}}
    else:
        primary_choice = "required"

    # De-duplicate while preserving order: without a forced tool the
    # primary choice already *is* "required", and repeating a request the
    # provider just rejected only burns a round-trip.
    attempts: List[Any] = []
    for candidate in (primary_choice, "required", "auto"):
        if candidate not in attempts:
            attempts.append(candidate)

    # Substrings that suggest the provider rejected the tool_choice shape
    # (as opposed to a network/auth failure, which should not be retried).
    rejection_markers = (
        "tool_choice", "tool choice", "required", "function call",
        "unsupported", "not supported", "invalid", "400",
    )

    last_err: Optional[str] = None
    for choice in attempts:
        try:
            return client.chat.completions.create(
                model=model,
                messages=messages,
                tools=tools,
                tool_choice=choice,
                temperature=0,
                max_tokens=max_tokens,
                timeout=timeout,
            ), None
        except Exception as exc:
            last_err = f"{type(exc).__name__}: {exc}"
            # Only retry on errors that look like the provider rejecting the
            # tool_choice shape. Network errors etc. should bail immediately.
            detail = str(exc).lower()
            if not any(marker in detail for marker in rejection_markers):
                return None, last_err
            logger.debug("goal judge: tool_choice=%r rejected (%s); falling back", choice, exc)
    return None, last_err or "all tool_choice fallbacks failed"
 def decompose_goal(
    goal: str,
    *,
    timeout: float = DEFAULT_JUDGE_TIMEOUT,
 ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
-    """Phase-A: ask the judge to break the goal into a checklist.
+    """Phase-A: ask the judge to break the goal into a checklist via a
    forced ``submit_checklist`` tool call.
-    Returns ``(items, error)``. On any failure, returns ``([], reason)`` so
+    Returns ``(items, error)``. On any failure, returns ``([], reason)``
-    the caller can decide whether to fall back to freeform mode.
+    so the caller can fall back to freeform mode.
    """
    if not goal.strip():
        return [], "empty goal"
@ -787,39 +1022,68 @@ def decompose_goal(
    if client is None:
        return [], "auxiliary client unavailable"
-    try:
+    messages = [
-        resp = client.chat.completions.create(
+        {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
-            model=model,
+        {
-            messages=[
+            "role": "user",
-                {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
+            "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
-                {
+                goal=_truncate(goal, 4000)
-                    "role": "user",
+            ),
-                    "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
+        },
-                        goal=_truncate(goal, 4000)
+    ]
-                    ),
+
-                },
+    resp, err = _call_judge_with_tool_choice(
-            ],
+        client,
-            temperature=0,
+        model=model,
-            max_tokens=2000,
+        messages=messages,
-            timeout=timeout,
+        tools=[_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA],
-        )
+        forced_tool_name="submit_checklist",
-    except Exception as exc:
+        timeout=timeout,
-        logger.info("goal decompose: API call failed (%s)", exc)
+        max_tokens=2000,
-        return [], f"decompose error: {type(exc).__name__}"
+    )
    if resp is None:
        logger.info("goal decompose: API call failed (%s)", err)
        return [], f"decompose error: {err}"
    try:
-        raw = resp.choices[0].message.content or ""
+        msg = resp.choices[0].message
    except Exception:
-        raw = ""
+        return [], "decompose response malformed"
-    items, parse_failed = _parse_decompose_response(raw)
+    tc = _extract_tool_call(msg, "submit_checklist")
-    if parse_failed or not items:
+    if tc is None:
-        logger.info(
+        # Provider responded but didn't call the tool. Try parsing content
-            "goal decompose: parse failed or empty checklist (raw=%r)",
+        # as a last-ditch backstop so a fully-broken provider doesn't
-            _truncate(raw, 200),
+        # silently leave the user with no checklist at all.
-        )
+        content = getattr(msg, "content", "") or ""
-        return [], "decompose parse failed or empty"
+        items, parse_failed = _parse_decompose_response(content)
-    logger.info("goal decompose: produced %d checklist items", len(items))
+        if parse_failed or not items:
            logger.info(
                "goal decompose: no submit_checklist tool call AND no parseable JSON (raw=%r)",
                _truncate(content, 200),
            )
            return [], "decompose: judge did not call submit_checklist"
        logger.info("goal decompose: fell back to JSON-content parser (%d items)", len(items))
        return items, None
    raw_items = tc["arguments"].get("items") or []
    items: List[Dict[str, Any]] = []
    if isinstance(raw_items, list):
        for entry in raw_items:
            if isinstance(entry, dict):
                text = str(entry.get("text", "")).strip()
                if text:
                    items.append({"text": text})
            elif isinstance(entry, str):
                text = entry.strip()
                if text:
                    items.append({"text": text})
    if not items:
        logger.info("goal decompose: submit_checklist returned empty items list")
        return [], "decompose: empty checklist"
    logger.info("goal decompose: produced %d checklist items via tool call", len(items))
    return items, None
@ -882,10 +1146,15 @@ def evaluate_checklist(
    timeout: float = DEFAULT_JUDGE_TIMEOUT,
    max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS,
 ) -> Tuple[Dict[str, Any], bool]:
-    """Phase-B: judge evaluates each pending checklist item.
+    """Phase-B: judge evaluates each pending checklist item via forced
    tool calls.
-    Runs a bounded tool loop so the judge can call ``read_file`` on the
+    The judge has two tools available:
-    dumped conversation history when the snippet isn't enough.
+      - ``read_file``: inspect the dumped conversation history
      - ``update_checklist``: issue the verdict (terminates the loop)
    ``tool_choice="required"`` forces one of them every iteration. We loop
    until ``update_checklist`` is called or ``max_tool_calls`` is exhausted.
    Returns ``(parsed, parse_failed)`` where parsed is
    ``{"updates": [...], "new_items": [...], "reason": str}``.
@ -895,7 +1164,8 @@ def evaluate_checklist(
    if client is None:
        return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False)
-    # Render checklist with 1-based indices the judge can address.
+    # Render checklist with 1-based indices the judge addresses via the
    # update_checklist tool's ``index`` field.
    checklist_block = state.render_checklist(numbered=True)
    user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format(
@ -910,136 +1180,170 @@ def evaluate_checklist(
        {"role": "user", "content": user_prompt},
    ]
-    # Some auxiliary providers may not support tool calls. We pass tools
+    # Build the toolbox: read_file is only useful when we actually have a
-    # optimistically; if the provider returns a verdict directly without
+    # history file to read, so we omit it otherwise to keep the schema lean.
-    # using them, we just parse it.
+    tools: List[Dict[str, Any]] = [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
-    tools = [_JUDGE_READ_FILE_TOOL_SCHEMA] if history_path is not None else None
+    if history_path is not None:
        tools.insert(0, _JUDGE_READ_FILE_TOOL_SCHEMA)
-    tool_calls_left = max(0, int(max_tool_calls))
+    reads_left = max(0, int(max_tool_calls)) if history_path is not None else 0
    final_raw = ""
-    for _ in range(tool_calls_left + 1):
+    # Bound the overall loop generously — the judge will normally finish in
-        try:
+    # one or two passes (read_file once, then update_checklist; or just
-            kwargs: Dict[str, Any] = {
+    # update_checklist directly).
-                "model": model,
+    for iteration in range(reads_left + 2):
-                "messages": messages,
+        # When out of read budget, drop read_file from the toolbox so the
-                "temperature": 0,
+        # judge MUST emit update_checklist.
-                "max_tokens": 1500,
+        loop_tools = tools if reads_left > 0 else [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
-                "timeout": timeout,
+        # Forcing update_checklist directly when reads are exhausted gives
-            }
+        # us the strongest guarantee of termination.
-            if tools:
+        forced = "update_checklist" if reads_left <= 0 else None
-                kwargs["tools"] = tools
+
-                kwargs["tool_choice"] = "auto"
+        resp, err = _call_judge_with_tool_choice(
-            resp = client.chat.completions.create(**kwargs)
+            client,
-        except Exception as exc:
+            model=model,
-            logger.info("goal judge (checklist): API call failed (%s)", exc)
+            messages=messages,
            tools=loop_tools,
            forced_tool_name=forced,
            timeout=timeout,
            max_tokens=1500,
        )
        if resp is None:
            logger.info("goal judge (checklist): API call failed (%s)", err)
            return (
                {
                    "updates": [],
                    "new_items": [],
-                    "reason": f"judge error: {type(exc).__name__}",
+                    "reason": f"judge error: {err}",
                },
                False,
            )
        try:
-            choice = resp.choices[0]
+            msg = resp.choices[0].message
            msg = choice.message
        except Exception:
            return (
                {"updates": [], "new_items": [], "reason": "judge response malformed"},
                True,
            )
-        # Unpack tool_calls in a way that works for openai-py and other shims.
+        # Did the judge call update_checklist? If yes, we're done.
-        tool_calls = getattr(msg, "tool_calls", None) or []
+        update_tc = _extract_tool_call(msg, "update_checklist")
-        content = getattr(msg, "content", "") or ""
+        if update_tc is not None:
            parsed = _normalize_update_args(update_tc["arguments"])
            logger.info(
                "goal judge (checklist): updates=%d new_items=%d reason=%s",
                len(parsed.get("updates") or []),
                len(parsed.get("new_items") or []),
                _truncate(parsed.get("reason", ""), 120),
            )
            return parsed, False
-        if not tool_calls:
+        # Did the judge call read_file? If yes, run it and feed the result back.
-            final_raw = content
+        read_tc = _extract_tool_call(msg, "read_file")
-            break
+        if read_tc is not None and reads_left > 0:
-
+            args = read_tc["arguments"]
-        if tool_calls_left <= 0:
+            tool_result = _judge_read_file(
-            # Out of budget. Force a final ruling on the next pass by
+                str(args.get("path", "")),
-            # appending a system note and disabling tools.
+                offset=args.get("offset", 1),
                limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
                allowed_path=history_path,
            )
            messages.append({
-                "role": "user",
+                "role": "assistant",
-                "content": (
+                "content": getattr(msg, "content", "") or "",
-                    "You have exhausted your read_file budget. Issue your "
+                "tool_calls": _serialize_assistant_tool_calls(msg),
                    "final JSON verdict now without calling more tools."
                ),
            })
            tools = None
            continue
        # Append the assistant turn, then handle each tool call.
        assistant_record: Dict[str, Any] = {
            "role": "assistant",
            "content": content,
            "tool_calls": [],
        }
        for tc in tool_calls:
            try:
                tc_id = getattr(tc, "id", None) or "tc-?"
                fn = getattr(tc, "function", None)
                fn_name = getattr(fn, "name", "") if fn is not None else ""
                fn_args = getattr(fn, "arguments", "") if fn is not None else ""
                assistant_record["tool_calls"].append({
                    "id": tc_id,
                    "type": "function",
                    "function": {"name": fn_name, "arguments": fn_args},
                })
            except Exception:
                continue
        messages.append(assistant_record)
        for tc in tool_calls:
            try:
                tc_id = getattr(tc, "id", None) or "tc-?"
                fn = getattr(tc, "function", None)
                fn_name = getattr(fn, "name", "") if fn is not None else ""
                fn_args_raw = getattr(fn, "arguments", "") if fn is not None else ""
            except Exception:
                continue
            try:
                args = json.loads(fn_args_raw) if isinstance(fn_args_raw, str) else (fn_args_raw or {})
            except Exception:
                args = {}
            if fn_name == "read_file":
                tool_result = _judge_read_file(
                    str(args.get("path", "")),
                    offset=args.get("offset", 1),
                    limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
                    allowed_path=history_path,
                )
            else:
                tool_result = json.dumps({"error": f"unknown tool: {fn_name}"})
            messages.append({
                "role": "tool",
-                "tool_call_id": tc_id,
+                "tool_call_id": read_tc["id"],
-                "name": fn_name,
+                "name": "read_file",
                "content": tool_result,
            })
-            tool_calls_left -= 1
+            reads_left -= 1
            continue
-        if tool_calls_left <= 0:
+        # Neither tool was called. Try parsing the content body as a last-
-            messages.append({
+        # ditch backstop, then bail.
-                "role": "user",
+        content = getattr(msg, "content", "") or ""
-                "content": (
+        if content.strip():
-                    "You have exhausted your read_file budget. Issue your "
+            parsed, parse_failed = _parse_evaluate_response(content)
-                    "final JSON verdict now without calling more tools."
+            if not parse_failed:
-                ),
+                logger.info(
-            })
+                    "goal judge (checklist): fell back to JSON-content parser "
-            tools = None
+                    "updates=%d new_items=%d",
                    len(parsed.get("updates") or []),
                    len(parsed.get("new_items") or []),
                )
                return parsed, False
        logger.info(
            "goal judge (checklist): judge emitted neither read_file nor "
            "update_checklist (iteration=%d, content=%r) — bailing",
            iteration, _truncate(content, 120),
        )
        return (
            {
                "updates": [],
                "new_items": [],
                "reason": "judge did not call update_checklist",
            },
            True,
        )
-    parsed, parse_failed = _parse_evaluate_response(final_raw)
+    # Loop exhausted without an update_checklist call.
-    logger.info(
+    return (
-        "goal judge (checklist): updates=%d new_items=%d reason=%s",
+        {
-        len(parsed.get("updates") or []),
+            "updates": [],
-        len(parsed.get("new_items") or []),
+            "new_items": [],
-        _truncate(parsed.get("reason", ""), 120),
+            "reason": "judge tool-loop exhausted without verdict",
        },
        True,
    )
-    return parsed, parse_failed
+
 def _normalize_update_args(args: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize the ``update_checklist`` tool arguments.

    Args:
        args: Parsed tool-call arguments. Anything that is not a dict is
            treated as empty rather than raising.

    Returns:
        The canonical ``{"updates", "new_items", "reason"}`` shape:

        * ``updates`` — entries ``{"index", "status", "evidence"}`` with
          the judge's 1-based ``index`` converted to 0-based. Entries are
          dropped when they are not dicts, when ``index`` is not
          convertible to an int or is below 1, or when the lower-cased
          ``status`` is not in ``TERMINAL_ITEM_STATUSES``. Blank
          ``evidence`` becomes ``None``.
        * ``new_items`` — ``{"text": ...}`` entries built from dicts or
          bare strings; blank texts are dropped.
        * ``reason`` — stripped string, ``"no reason provided"`` if empty.
    """
    if not isinstance(args, dict):
        args = {}
    raw_updates = args.get("updates") or []
    raw_new = args.get("new_items") or []
    reason = str(args.get("reason") or "").strip() or "no reason provided"

    norm_updates: List[Dict[str, Any]] = []
    if isinstance(raw_updates, list):
        for upd in raw_updates:
            if not isinstance(upd, dict):
                continue
            try:
                idx_1based = int(upd.get("index"))
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int(float("inf")) — json.loads accepts
                # the ``Infinity`` literal.
                continue
            if idx_1based < 1:
                # A 1-based index below 1 would become a negative 0-based
                # index, which Python sequence indexing counts from the end.
                continue
            status = str(upd.get("status", "")).strip().lower()
            if status not in TERMINAL_ITEM_STATUSES:
                continue
            evidence = str(upd.get("evidence") or "").strip() or None
            norm_updates.append({
                "index": idx_1based - 1,  # 1-based → 0-based for apply layer
                "status": status,
                "evidence": evidence,
            })

    norm_new: List[Dict[str, Any]] = []
    if isinstance(raw_new, list):
        for it in raw_new:
            if isinstance(it, dict):
                text = str(it.get("text", "")).strip()
            elif isinstance(it, str):
                text = it.strip()
            else:
                continue
            if text:
                norm_new.append({"text": text})

    return {"updates": norm_updates, "new_items": norm_new, "reason": reason}
 # ──────────────────────────────────────────────────────────────────────
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@ -1127,3 +1127,251 @@ class TestGoalSurvivesCompressionRotation:
        # Child should still have no goal.
        assert load_goal("child-no-goal") is None
 # ──────────────────────────────────────────────────────────────────────
 # Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
 # ──────────────────────────────────────────────────────────────────────
 class _FakeFn:
    def __init__(self, name, args):
        self.name = name
        self.arguments = args if isinstance(args, str) else json.dumps(args)
 class _FakeToolCall:
    """Stand-in for one tool-call entry on an assistant message.
    Carries the call id, the fixed type ``"function"`` and a ``_FakeFn``
    built from *name* and *args*.
    """
    def __init__(self, tc_id, name, args):
        self.function = _FakeFn(name, args)
        self.type = "function"
        self.id = tc_id
 class _FakeMessage:
    def __init__(self, *, content="", tool_calls=None):
        self.content = content
        self.tool_calls = tool_calls or []
 class _FakeChoice:
    def __init__(self, message):
        self.message = message
 class _FakeResponse:
    """Stand-in for a chat-completion response with exactly one choice."""
    def __init__(self, message):
        only_choice = _FakeChoice(message)
        self.choices = [only_choice]
 def _make_fake_client(scripted_messages):
    """Return a fake client whose .chat.completions.create() returns the
    next scripted message each call. Mutates the underlying list as a
    queue so repeat calls advance.
    """
    class FakeClient:
        class chat:
            class completions:
                _queue = list(scripted_messages)
                _calls = []
                @classmethod
                def create(cls, **kwargs):
                    cls._calls.append(kwargs)
                    if not cls._queue:
                        raise RuntimeError("scripted-message queue exhausted")
                    return _FakeResponse(cls._queue.pop(0))
    return FakeClient
 class TestPhaseAToolCall:
    """Phase A (decompose) tests for the forced ``submit_checklist`` tool call."""
    @staticmethod
    def _decompose_with(reply, goal_text):
        """Run ``decompose_goal(goal_text)`` with the judge client faked.
        The fake client is scripted with the single message *reply*.
        Returns ``(items, err, client)`` so a test can also inspect the
        kwargs recorded on ``client.chat.completions._calls``.
        """
        from hermes_cli import goals
        from hermes_cli.goals import decompose_goal
        fake = _make_fake_client([reply])
        with patch.object(goals, "_get_judge_client", return_value=(fake, "fake-model")):
            items, err = decompose_goal(goal_text)
        return items, err, fake
    def test_decompose_via_submit_checklist_tool(self, hermes_home):
        """A ``submit_checklist`` tool call yields the checklist items."""
        criteria = [{"text": "first criterion"}, {"text": "second criterion"}]
        reply = _FakeMessage(
            tool_calls=[_FakeToolCall("tc-1", "submit_checklist", {"items": criteria})],
        )
        items, err, fake = self._decompose_with(reply, "build a website")
        assert err is None
        assert [entry["text"] for entry in items] == ["first criterion", "second criterion"]
        # Verify we forced the tool: tool_choice should target submit_checklist.
        first_call = fake.chat.completions._calls[0]
        assert "tools" in first_call
        assert first_call["tools"][0]["function"]["name"] == "submit_checklist"
        # Accepted shapes: the forced-function dict, or "required" / "auto"
        # if a fallback was used; the primary attempt forces the tool.
        choice = first_call["tool_choice"]
        forced = (
            isinstance(choice, dict)
            and choice.get("function", {}).get("name") == "submit_checklist"
        )
        assert forced or choice in ("required", "auto")
    def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
        """If a broken provider returns content instead of a tool call, the
        backstop JSON parser still salvages a checklist."""
        reply = _FakeMessage(
            content='{"checklist": [{"text": "salvaged"}]}',
            tool_calls=[],
        )
        items, err, _client = self._decompose_with(reply, "g")
        assert err is None
        assert items == [{"text": "salvaged"}]
    def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
        """Plain prose with no tool call gives no items and an error
        mentioning ``submit_checklist``."""
        reply = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
        items, err, _client = self._decompose_with(reply, "g")
        assert items == []
        assert err and "submit_checklist" in err
    def test_decompose_drops_empty_text_items(self, hermes_home):
        """Items whose text is empty or whitespace-only are discarded."""
        submitted = [{"text": raw} for raw in ("ok", "", "  ", "two")]
        reply = _FakeMessage(
            tool_calls=[_FakeToolCall("tc-1", "submit_checklist", {"items": submitted})],
        )
        items, err, _client = self._decompose_with(reply, "g")
        assert err is None
        assert [entry["text"] for entry in items] == ["ok", "two"]
 class TestPhaseBToolCall:
    """Phase B (evaluate) tests for the ``update_checklist`` tool call."""
    @staticmethod
    def _pending_state(*texts):
        """Build a decomposed ``GoalState`` for goal "g" with one pending
        checklist item per entry in *texts*."""
        from hermes_cli.goals import GoalState, ChecklistItem, ITEM_PENDING
        return GoalState(
            goal="g",
            decomposed=True,
            checklist=[ChecklistItem(text=t, status=ITEM_PENDING) for t in texts],
        )
    @staticmethod
    def _evaluate_with(replies, state, last_response, history_path):
        """Run ``evaluate_checklist`` with the judge client faked.
        The fake client is scripted with *replies* in order. Returns
        ``(parsed, parse_failed, client)``.
        """
        from hermes_cli import goals
        from hermes_cli.goals import evaluate_checklist
        fake = _make_fake_client(replies)
        with patch.object(goals, "_get_judge_client", return_value=(fake, "fake-model")):
            parsed, parse_failed = evaluate_checklist(
                state, last_response, history_path=history_path,
            )
        return parsed, parse_failed, fake
    def test_evaluate_via_update_checklist_tool(self, hermes_home):
        """A single ``update_checklist`` call is returned in canonical shape."""
        state = self._pending_state("a", "b")
        verdict_args = {
            # 1-based indices; layer converts to 0-based.
            "updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
            "new_items": [{"text": "discovered c"}],
            "reason": "ran a",
        }
        verdict = _FakeMessage(
            tool_calls=[_FakeToolCall("tc-1", "update_checklist", verdict_args)],
        )
        parsed, parse_failed, _client = self._evaluate_with(
            [verdict], state, "did the first thing", None,
        )
        assert parse_failed is False
        # Index converted 1 → 0
        assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
        assert parsed["new_items"] == [{"text": "discovered c"}]
        assert parsed["reason"] == "ran a"
    def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
        """Phase-B tool loop: judge calls read_file once, then update_checklist."""
        # Make a real history file so the path-restriction check passes.
        hist = tmp_path / "hist.json"
        hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
        state = self._pending_state("a")
        read_step = _FakeMessage(tool_calls=[_FakeToolCall(
            "tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
        )])
        verdict_args = {
            "updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
            "new_items": [],
            "reason": "verified via read_file",
        }
        verdict_step = _FakeMessage(tool_calls=[_FakeToolCall(
            "tc-2", "update_checklist", verdict_args,
        )])
        parsed, parse_failed, fake = self._evaluate_with(
            [read_step, verdict_step], state, "did the thing", hist,
        )
        assert parse_failed is False
        assert parsed["updates"][0]["status"] == "completed"
        assert parsed["reason"] == "verified via read_file"
        # Two API calls — one for the read, one for the verdict.
        assert len(fake.chat.completions._calls) == 2
    def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
        """update_checklist should only accept 'completed' or 'impossible' —
        any 'pending' updates are dropped at the normalize layer."""
        state = self._pending_state("a", "b")
        verdict_args = {
            "updates": [
                {"index": 1, "status": "completed", "evidence": "yes"},
                {"index": 2, "status": "pending", "evidence": "skip me"},
            ],
            "new_items": [],
            "reason": "...",
        }
        verdict = _FakeMessage(tool_calls=[_FakeToolCall(
            "tc-1", "update_checklist", verdict_args,
        )])
        parsed, _pf, _client = self._evaluate_with([verdict], state, "x", None)
        # Only the completed flip survives; pending update is dropped silently.
        assert len(parsed["updates"]) == 1
        assert parsed["updates"][0]["index"] == 0