mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)
Live-tested on gemini-3-flash-preview the judge kept returning empty or non-JSON content, tripping the consecutive-parse-failures auto- pause. Free-form JSON output is hopeful; tool-call schemas are enforced server-side by virtually every modern provider. Two new tools the judge calls: - submit_checklist(items) — Phase A, decompose - update_checklist(updates, new_items, reason) — Phase B, evaluate Both phases now call the auxiliary client with tool_choice forcing the right tool. read_file remains for Phase B history inspection, with the loop exiting only when update_checklist is called or the read budget is exhausted (at which point read_file is dropped from the toolbox and update_checklist is forced). Robustness: - _call_judge_with_tool_choice falls back tool_choice forced→required→ auto if the provider rejects a particular shape. - If a fully-broken provider still returns content instead of a tool call, the legacy JSON-text parsers stay around as a last-ditch backstop so we never silently lose a checklist. - _normalize_update_args replaces the JSON parser for the apply layer; same 1-based→0-based conversion + terminal-status filter. Live verification: same fizzbuzz goal that was hitting 'judge model returned unparseable output 3 turns in a row' before now terminates in 2 turns, all 11 items marked completed with item-specific evidence, no auto-pause. Agent log shows 'produced 11 checklist items via tool call' instead of the JSON- parse path. Tests: 7 new cases for the tool-call path (Phase A success, Phase B update only, Phase B read_file→update, JSON-content backstop, empty-text item dropping, non-terminal status filter).
This commit is contained in:
parent
4a080b1d5a
commit
a63a2b7c78
2 changed files with 708 additions and 156 deletions
|
|
@ -1127,3 +1127,251 @@ class TestGoalSurvivesCompressionRotation:
|
|||
|
||||
# Child should still have no goal.
|
||||
assert load_goal("child-no-goal") is None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeFn:
|
||||
def __init__(self, name, args):
|
||||
self.name = name
|
||||
self.arguments = args if isinstance(args, str) else json.dumps(args)
|
||||
|
||||
|
||||
class _FakeToolCall:
|
||||
def __init__(self, tc_id, name, args):
|
||||
self.id = tc_id
|
||||
self.type = "function"
|
||||
self.function = _FakeFn(name, args)
|
||||
|
||||
|
||||
class _FakeMessage:
|
||||
def __init__(self, *, content="", tool_calls=None):
|
||||
self.content = content
|
||||
self.tool_calls = tool_calls or []
|
||||
|
||||
|
||||
class _FakeChoice:
|
||||
def __init__(self, message):
|
||||
self.message = message
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
def __init__(self, message):
|
||||
self.choices = [_FakeChoice(message)]
|
||||
|
||||
|
||||
def _make_fake_client(scripted_messages):
|
||||
"""Return a fake client whose .chat.completions.create() returns the
|
||||
next scripted message each call. Mutates the underlying list as a
|
||||
queue so repeat calls advance.
|
||||
"""
|
||||
class FakeClient:
|
||||
class chat:
|
||||
class completions:
|
||||
_queue = list(scripted_messages)
|
||||
_calls = []
|
||||
|
||||
@classmethod
|
||||
def create(cls, **kwargs):
|
||||
cls._calls.append(kwargs)
|
||||
if not cls._queue:
|
||||
raise RuntimeError("scripted-message queue exhausted")
|
||||
return _FakeResponse(cls._queue.pop(0))
|
||||
|
||||
return FakeClient
|
||||
|
||||
|
||||
class TestPhaseAToolCall:
|
||||
def test_decompose_via_submit_checklist_tool(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "submit_checklist",
|
||||
{"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("build a website")
|
||||
|
||||
assert err is None
|
||||
assert [it["text"] for it in items] == ["first criterion", "second criterion"]
|
||||
# Verify we forced the tool: tool_choice should target submit_checklist.
|
||||
call = client.chat.completions._calls[0]
|
||||
assert "tools" in call
|
||||
assert call["tools"][0]["function"]["name"] == "submit_checklist"
|
||||
# tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
|
||||
# or "required" / "auto" if a fallback was used; primary attempt forces it.
|
||||
tc = call["tool_choice"]
|
||||
assert (
|
||||
(isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
|
||||
or tc == "required"
|
||||
or tc == "auto"
|
||||
)
|
||||
|
||||
def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
|
||||
"""If a broken provider returns content instead of a tool call, the
|
||||
backstop JSON parser still salvages a checklist."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
content='{"checklist": [{"text": "salvaged"}]}',
|
||||
tool_calls=[],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert err is None
|
||||
assert items == [{"text": "salvaged"}]
|
||||
|
||||
def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert items == []
|
||||
assert err and "submit_checklist" in err
|
||||
|
||||
def test_decompose_drops_empty_text_items(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "submit_checklist",
|
||||
{"items": [{"text": "ok"}, {"text": ""}, {"text": " "}, {"text": "two"}]},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert err is None
|
||||
assert [it["text"] for it in items] == ["ok", "two"]
|
||||
|
||||
|
||||
class TestPhaseBToolCall:
|
||||
def test_evaluate_via_update_checklist_tool(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[
|
||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
||||
],
|
||||
)
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "update_checklist",
|
||||
{
|
||||
# 1-based indices; layer converts to 0-based.
|
||||
"updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
|
||||
"new_items": [{"text": "discovered c"}],
|
||||
"reason": "ran a",
|
||||
},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, parse_failed = evaluate_checklist(
|
||||
state, "did the first thing", history_path=None,
|
||||
)
|
||||
|
||||
assert parse_failed is False
|
||||
# Index converted 1 → 0
|
||||
assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
|
||||
assert parsed["new_items"] == [{"text": "discovered c"}]
|
||||
assert parsed["reason"] == "ran a"
|
||||
|
||||
def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
|
||||
"""Phase-B tool loop: judge calls read_file once, then update_checklist."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
# Make a real history file so the path-restriction check passes.
|
||||
hist = tmp_path / "hist.json"
|
||||
hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
|
||||
)
|
||||
|
||||
msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
|
||||
)])
|
||||
msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-2", "update_checklist",
|
||||
{
|
||||
"updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
|
||||
"new_items": [],
|
||||
"reason": "verified via read_file",
|
||||
},
|
||||
)])
|
||||
client = _make_fake_client([msg1, msg2])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, parse_failed = evaluate_checklist(
|
||||
state, "did the thing", history_path=hist,
|
||||
)
|
||||
|
||||
assert parse_failed is False
|
||||
assert parsed["updates"][0]["status"] == "completed"
|
||||
assert parsed["reason"] == "verified via read_file"
|
||||
# Two API calls — one for the read, one for the verdict.
|
||||
assert len(client.chat.completions._calls) == 2
|
||||
|
||||
def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
|
||||
"""update_checklist should only accept 'completed' or 'impossible' —
|
||||
any 'pending' updates are dropped at the normalize layer."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[
|
||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
||||
],
|
||||
)
|
||||
msg = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-1", "update_checklist",
|
||||
{
|
||||
"updates": [
|
||||
{"index": 1, "status": "completed", "evidence": "yes"},
|
||||
{"index": 2, "status": "pending", "evidence": "skip me"},
|
||||
],
|
||||
"new_items": [],
|
||||
"reason": "...",
|
||||
},
|
||||
)])
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, _pf = evaluate_checklist(state, "x", history_path=None)
|
||||
|
||||
# Only the completed flip survives; pending update is dropped silently.
|
||||
assert len(parsed["updates"]) == 1
|
||||
assert parsed["updates"][0]["index"] == 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue