fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)

Live-tested on gemini-3-flash-preview the judge kept returning empty or non-JSON content, tripping the consecutive-parse-failures auto- pause. Free-form JSON output is hopeful; tool-call schemas are enforced server-side by virtually every modern provider. Two new tools the judge calls: - submit_checklist(items) — Phase A, decompose - update_checklist(updates, new_items, reason) — Phase B, evaluate Both phases now call the auxiliary client with tool_choice forcing the right tool. read_file remains for Phase B history inspection, with the loop exiting only when update_checklist is called or the read budget is exhausted (at which point read_file is dropped from the toolbox and update_checklist is forced). Robustness: - _call_judge_with_tool_choice falls back tool_choice forced→required→ auto if the provider rejects a particular shape. - If a fully-broken provider still returns content instead of a tool call, the legacy JSON-text parsers stay around as a last-ditch backstop so we never silently lose a checklist. - _normalize_update_args replaces the JSON parser for the apply layer; same 1-based→0-based conversion + terminal-status filter. Live verification: same fizzbuzz goal that was hitting 'judge model returned unparseable output 3 turns in a row' before now terminates in 2 turns, all 11 items marked completed with item-specific evidence, no auto-pause. Agent log shows 'produced 11 checklist items via tool call' instead of the JSON- parse path. Tests: 7 new cases for the tool-call path (Phase A success, Phase B update only, Phase B read_file→update, JSON-content backstop, empty-text item dropping, non-terminal status filter).
2026-05-18 04:41:56 +00:00 · 2026-05-10 20:51:40 -07:00 · 2026-05-10 20:51:40 -07:00 · a63a2b7c78
commit a63a2b7c78
parent 4a080b1d5a
2 changed files with 708 additions and 156 deletions
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@ -1127,3 +1127,251 @@ class TestGoalSurvivesCompressionRotation:

        # Child should still have no goal.
        assert load_goal("child-no-goal") is None
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
+# ──────────────────────────────────────────────────────────────────────
+
+
+class _FakeFn:
+    def __init__(self, name, args):
+        self.name = name
+        self.arguments = args if isinstance(args, str) else json.dumps(args)
+
+
+class _FakeToolCall:
+    def __init__(self, tc_id, name, args):
+        self.id = tc_id
+        self.type = "function"
+        self.function = _FakeFn(name, args)
+
+
+class _FakeMessage:
+    def __init__(self, *, content="", tool_calls=None):
+        self.content = content
+        self.tool_calls = tool_calls or []
+
+
+class _FakeChoice:
+    def __init__(self, message):
+        self.message = message
+
+
+class _FakeResponse:
+    def __init__(self, message):
+        self.choices = [_FakeChoice(message)]
+
+
+def _make_fake_client(scripted_messages):
+    """Return a fake client whose .chat.completions.create() returns the
+    next scripted message each call. Mutates the underlying list as a
+    queue so repeat calls advance.
+    """
+    class FakeClient:
+        class chat:
+            class completions:
+                _queue = list(scripted_messages)
+                _calls = []
+
+                @classmethod
+                def create(cls, **kwargs):
+                    cls._calls.append(kwargs)
+                    if not cls._queue:
+                        raise RuntimeError("scripted-message queue exhausted")
+                    return _FakeResponse(cls._queue.pop(0))
+
+    return FakeClient
+
+
+class TestPhaseAToolCall:
+    def test_decompose_via_submit_checklist_tool(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import decompose_goal
+
+        msg = _FakeMessage(
+            tool_calls=[_FakeToolCall(
+                "tc-1", "submit_checklist",
+                {"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
+            )],
+        )
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            items, err = decompose_goal("build a website")
+
+        assert err is None
+        assert [it["text"] for it in items] == ["first criterion", "second criterion"]
+        # Verify we forced the tool: tool_choice should target submit_checklist.
+        call = client.chat.completions._calls[0]
+        assert "tools" in call
+        assert call["tools"][0]["function"]["name"] == "submit_checklist"
+        # tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
+        # or "required" / "auto" if a fallback was used; primary attempt forces it.
+        tc = call["tool_choice"]
+        assert (
+            (isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
+            or tc == "required"
+            or tc == "auto"
+        )
+
+    def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
+        """If a broken provider returns content instead of a tool call, the
+        backstop JSON parser still salvages a checklist."""
+        from hermes_cli import goals
+        from hermes_cli.goals import decompose_goal
+
+        msg = _FakeMessage(
+            content='{"checklist": [{"text": "salvaged"}]}',
+            tool_calls=[],
+        )
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            items, err = decompose_goal("g")
+
+        assert err is None
+        assert items == [{"text": "salvaged"}]
+
+    def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import decompose_goal
+
+        msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            items, err = decompose_goal("g")
+
+        assert items == []
+        assert err and "submit_checklist" in err
+
+    def test_decompose_drops_empty_text_items(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import decompose_goal
+
+        msg = _FakeMessage(
+            tool_calls=[_FakeToolCall(
+                "tc-1", "submit_checklist",
+                {"items": [{"text": "ok"}, {"text": ""}, {"text": "  "}, {"text": "two"}]},
+            )],
+        )
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            items, err = decompose_goal("g")
+
+        assert err is None
+        assert [it["text"] for it in items] == ["ok", "two"]
+
+
+class TestPhaseBToolCall:
+    def test_evaluate_via_update_checklist_tool(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
+
+        state = GoalState(
+            goal="g",
+            decomposed=True,
+            checklist=[
+                ChecklistItem(text="a", status=ITEM_PENDING),
+                ChecklistItem(text="b", status=ITEM_PENDING),
+            ],
+        )
+
+        msg = _FakeMessage(
+            tool_calls=[_FakeToolCall(
+                "tc-1", "update_checklist",
+                {
+                    # 1-based indices; layer converts to 0-based.
+                    "updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
+                    "new_items": [{"text": "discovered c"}],
+                    "reason": "ran a",
+                },
+            )],
+        )
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            parsed, parse_failed = evaluate_checklist(
+                state, "did the first thing", history_path=None,
+            )
+
+        assert parse_failed is False
+        # Index converted 1 → 0
+        assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
+        assert parsed["new_items"] == [{"text": "discovered c"}]
+        assert parsed["reason"] == "ran a"
+
+    def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
+        """Phase-B tool loop: judge calls read_file once, then update_checklist."""
+        from hermes_cli import goals
+        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
+
+        # Make a real history file so the path-restriction check passes.
+        hist = tmp_path / "hist.json"
+        hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
+
+        state = GoalState(
+            goal="g",
+            decomposed=True,
+            checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
+        )
+
+        msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
+            "tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
+        )])
+        msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
+            "tc-2", "update_checklist",
+            {
+                "updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
+                "new_items": [],
+                "reason": "verified via read_file",
+            },
+        )])
+        client = _make_fake_client([msg1, msg2])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            parsed, parse_failed = evaluate_checklist(
+                state, "did the thing", history_path=hist,
+            )
+
+        assert parse_failed is False
+        assert parsed["updates"][0]["status"] == "completed"
+        assert parsed["reason"] == "verified via read_file"
+        # Two API calls — one for the read, one for the verdict.
+        assert len(client.chat.completions._calls) == 2
+
+    def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
+        """update_checklist should only accept 'completed' or 'impossible' —
+        any 'pending' updates are dropped at the normalize layer."""
+        from hermes_cli import goals
+        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
+
+        state = GoalState(
+            goal="g",
+            decomposed=True,
+            checklist=[
+                ChecklistItem(text="a", status=ITEM_PENDING),
+                ChecklistItem(text="b", status=ITEM_PENDING),
+            ],
+        )
+        msg = _FakeMessage(tool_calls=[_FakeToolCall(
+            "tc-1", "update_checklist",
+            {
+                "updates": [
+                    {"index": 1, "status": "completed", "evidence": "yes"},
+                    {"index": 2, "status": "pending", "evidence": "skip me"},
+                ],
+                "new_items": [],
+                "reason": "...",
+            },
+        )])
+        client = _make_fake_client([msg])
+
+        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
+            parsed, _pf = evaluate_checklist(state, "x", history_path=None)
+
+        # Only the completed flip survives; pending update is dropped silently.
+        assert len(parsed["updates"]) == 1
+        assert parsed["updates"][0]["index"] == 0