fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)

Live-tested on gemini-3-flash-preview the judge kept returning empty
or non-JSON content, tripping the consecutive-parse-failures auto-
pause. Free-form JSON output is hopeful; tool-call schemas are
enforced server-side by virtually every modern provider.

Two new tools the judge calls:

  - submit_checklist(items)  — Phase A, decompose
  - update_checklist(updates, new_items, reason) — Phase B, evaluate

Both phases now call the auxiliary client with tool_choice forcing
the right tool. read_file remains for Phase B history inspection,
with the loop exiting only when update_checklist is called or the
read budget is exhausted (at which point read_file is dropped from
the toolbox and update_checklist is forced).

Robustness:
- _call_judge_with_tool_choice falls back tool_choice forced→required→
  auto if the provider rejects a particular shape.
- If a fully-broken provider still returns content instead of a tool
  call, the legacy JSON-text parsers stay around as a last-ditch
  backstop so we never silently lose a checklist.
- _normalize_update_args replaces the JSON parser for the apply
  layer; same 1-based→0-based conversion + terminal-status filter.

Live verification: same fizzbuzz goal that was hitting 'judge model
returned unparseable output 3 turns in a row' before now terminates
in 2 turns, all 11 items marked completed with item-specific
evidence, no auto-pause. Agent log shows
'produced 11 checklist items via tool call' instead of the JSON-
parse path.

Tests: 7 new cases for the tool-call path (Phase A success, Phase B
update only, Phase B read_file→update, JSON-content backstop,
empty-text item dropping, non-terminal status filter).
This commit is contained in:
Teknium 2026-05-10 20:51:40 -07:00 committed by GitHub
parent 4a080b1d5a
commit a63a2b7c78
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 708 additions and 156 deletions

View file

@ -1127,3 +1127,251 @@ class TestGoalSurvivesCompressionRotation:
# Child should still have no goal.
assert load_goal("child-no-goal") is None
# ──────────────────────────────────────────────────────────────────────
# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
# ──────────────────────────────────────────────────────────────────────
class _FakeFn:
def __init__(self, name, args):
self.name = name
self.arguments = args if isinstance(args, str) else json.dumps(args)
class _FakeToolCall:
def __init__(self, tc_id, name, args):
self.id = tc_id
self.type = "function"
self.function = _FakeFn(name, args)
class _FakeMessage:
def __init__(self, *, content="", tool_calls=None):
self.content = content
self.tool_calls = tool_calls or []
class _FakeChoice:
def __init__(self, message):
self.message = message
class _FakeResponse:
def __init__(self, message):
self.choices = [_FakeChoice(message)]
def _make_fake_client(scripted_messages):
"""Return a fake client whose .chat.completions.create() returns the
next scripted message each call. Mutates the underlying list as a
queue so repeat calls advance.
"""
class FakeClient:
class chat:
class completions:
_queue = list(scripted_messages)
_calls = []
@classmethod
def create(cls, **kwargs):
cls._calls.append(kwargs)
if not cls._queue:
raise RuntimeError("scripted-message queue exhausted")
return _FakeResponse(cls._queue.pop(0))
return FakeClient
class TestPhaseAToolCall:
def test_decompose_via_submit_checklist_tool(self, hermes_home):
from hermes_cli import goals
from hermes_cli.goals import decompose_goal
msg = _FakeMessage(
tool_calls=[_FakeToolCall(
"tc-1", "submit_checklist",
{"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
)],
)
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
items, err = decompose_goal("build a website")
assert err is None
assert [it["text"] for it in items] == ["first criterion", "second criterion"]
# Verify we forced the tool: tool_choice should target submit_checklist.
call = client.chat.completions._calls[0]
assert "tools" in call
assert call["tools"][0]["function"]["name"] == "submit_checklist"
# tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
# or "required" / "auto" if a fallback was used; primary attempt forces it.
tc = call["tool_choice"]
assert (
(isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
or tc == "required"
or tc == "auto"
)
def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
"""If a broken provider returns content instead of a tool call, the
backstop JSON parser still salvages a checklist."""
from hermes_cli import goals
from hermes_cli.goals import decompose_goal
msg = _FakeMessage(
content='{"checklist": [{"text": "salvaged"}]}',
tool_calls=[],
)
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
items, err = decompose_goal("g")
assert err is None
assert items == [{"text": "salvaged"}]
def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
from hermes_cli import goals
from hermes_cli.goals import decompose_goal
msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
items, err = decompose_goal("g")
assert items == []
assert err and "submit_checklist" in err
def test_decompose_drops_empty_text_items(self, hermes_home):
from hermes_cli import goals
from hermes_cli.goals import decompose_goal
msg = _FakeMessage(
tool_calls=[_FakeToolCall(
"tc-1", "submit_checklist",
{"items": [{"text": "ok"}, {"text": ""}, {"text": " "}, {"text": "two"}]},
)],
)
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
items, err = decompose_goal("g")
assert err is None
assert [it["text"] for it in items] == ["ok", "two"]
class TestPhaseBToolCall:
def test_evaluate_via_update_checklist_tool(self, hermes_home):
from hermes_cli import goals
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
state = GoalState(
goal="g",
decomposed=True,
checklist=[
ChecklistItem(text="a", status=ITEM_PENDING),
ChecklistItem(text="b", status=ITEM_PENDING),
],
)
msg = _FakeMessage(
tool_calls=[_FakeToolCall(
"tc-1", "update_checklist",
{
# 1-based indices; layer converts to 0-based.
"updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
"new_items": [{"text": "discovered c"}],
"reason": "ran a",
},
)],
)
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
parsed, parse_failed = evaluate_checklist(
state, "did the first thing", history_path=None,
)
assert parse_failed is False
# Index converted 1 → 0
assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
assert parsed["new_items"] == [{"text": "discovered c"}]
assert parsed["reason"] == "ran a"
def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
"""Phase-B tool loop: judge calls read_file once, then update_checklist."""
from hermes_cli import goals
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
# Make a real history file so the path-restriction check passes.
hist = tmp_path / "hist.json"
hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
state = GoalState(
goal="g",
decomposed=True,
checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
)
msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
"tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
)])
msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
"tc-2", "update_checklist",
{
"updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
"new_items": [],
"reason": "verified via read_file",
},
)])
client = _make_fake_client([msg1, msg2])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
parsed, parse_failed = evaluate_checklist(
state, "did the thing", history_path=hist,
)
assert parse_failed is False
assert parsed["updates"][0]["status"] == "completed"
assert parsed["reason"] == "verified via read_file"
# Two API calls — one for the read, one for the verdict.
assert len(client.chat.completions._calls) == 2
def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
"""update_checklist should only accept 'completed' or 'impossible'
any 'pending' updates are dropped at the normalize layer."""
from hermes_cli import goals
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
state = GoalState(
goal="g",
decomposed=True,
checklist=[
ChecklistItem(text="a", status=ITEM_PENDING),
ChecklistItem(text="b", status=ITEM_PENDING),
],
)
msg = _FakeMessage(tool_calls=[_FakeToolCall(
"tc-1", "update_checklist",
{
"updates": [
{"index": 1, "status": "completed", "evidence": "yes"},
{"index": 2, "status": "pending", "evidence": "skip me"},
],
"new_items": [],
"reason": "...",
},
)])
client = _make_fake_client([msg])
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
parsed, _pf = evaluate_checklist(state, "x", history_path=None)
# Only the completed flip survives; pending update is dropped silently.
assert len(parsed["updates"]) == 1
assert parsed["updates"][0]["index"] == 0