mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)
Live-tested on gemini-3-flash-preview the judge kept returning empty or non-JSON content, tripping the consecutive-parse-failures auto- pause. Free-form JSON output is hopeful; tool-call schemas are enforced server-side by virtually every modern provider. Two new tools the judge calls: - submit_checklist(items) — Phase A, decompose - update_checklist(updates, new_items, reason) — Phase B, evaluate Both phases now call the auxiliary client with tool_choice forcing the right tool. read_file remains for Phase B history inspection, with the loop exiting only when update_checklist is called or the read budget is exhausted (at which point read_file is dropped from the toolbox and update_checklist is forced). Robustness: - _call_judge_with_tool_choice falls back tool_choice forced→required→ auto if the provider rejects a particular shape. - If a fully-broken provider still returns content instead of a tool call, the legacy JSON-text parsers stay around as a last-ditch backstop so we never silently lose a checklist. - _normalize_update_args replaces the JSON parser for the apply layer; same 1-based→0-based conversion + terminal-status filter. Live verification: same fizzbuzz goal that was hitting 'judge model returned unparseable output 3 turns in a row' before now terminates in 2 turns, all 11 items marked completed with item-specific evidence, no auto-pause. Agent log shows 'produced 11 checklist items via tool call' instead of the JSON- parse path. Tests: 7 new cases for the tool-call path (Phase A success, Phase B update only, Phase B read_file→update, JSON-content backstop, empty-text item dropping, non-terminal status filter).
This commit is contained in:
parent
4a080b1d5a
commit
a63a2b7c78
2 changed files with 708 additions and 156 deletions
|
|
@ -143,8 +143,9 @@ DECOMPOSE_SYSTEM_PROMPT = (
|
|||
"publicly accessible, domain/URL is functional, etc. Better to "
|
||||
"over-specify and let a few items get marked impossible than to "
|
||||
"under-specify and let the agent declare victory early.\n\n"
|
||||
"Reply ONLY with a single JSON object on one line:\n"
|
||||
'{"checklist": [{"text": "<item>"}, {"text": "<item>"}, ...]}'
|
||||
"Submit your checklist by calling the ``submit_checklist`` tool. Do "
|
||||
"not reply with prose or JSON in your message body — call the tool. "
|
||||
"The system will not see anything you write outside the tool call."
|
||||
)
|
||||
|
||||
DECOMPOSE_USER_PROMPT_TEMPLATE = (
|
||||
|
|
@ -196,23 +197,20 @@ EVALUATE_SYSTEM_PROMPT_CHECKLIST = (
|
|||
"demonstrates the item cannot be achieved in this environment (NOT "
|
||||
"merely that the agent didn't try). Vague intentions ('I will do X "
|
||||
"next') do NOT count as completion.\n\n"
|
||||
"You may APPEND new checklist items if the agent's work reveals "
|
||||
"criteria the original decomposition missed. Stay strict — only add "
|
||||
"items that genuinely belong as completion criteria.\n\n"
|
||||
"STICKINESS: items already marked completed or impossible are frozen. "
|
||||
"Do not include them in your updates. Only the user can revert them.\n\n"
|
||||
"TOOLS: you have read_file(path, offset, limit) available. The full "
|
||||
"conversation history for this session is dumped to a JSON file whose "
|
||||
"path is given in the user message. Call read_file on it when the "
|
||||
"snippet is ambiguous, when you need to verify a tool call actually "
|
||||
"happened, or when you want to see what skills the agent loaded. "
|
||||
"Otherwise, judge from the snippet directly — extra reads cost tokens.\n\n"
|
||||
"When you are ready to rule, reply ONLY with a single JSON object:\n"
|
||||
'{"updates": [{"index": <i>, "status": "completed|impossible", "evidence": "<why>"}, ...], '
|
||||
'"new_items": [{"text": "<new item>"}, ...], '
|
||||
'"reason": "<one-sentence overall rationale>"}\n'
|
||||
"When citing evidence, reference the agent's actual output specifically. "
|
||||
"Empty updates is fine. Empty new_items is fine. The reason field is required."
|
||||
"TOOLS:\n"
|
||||
"- ``read_file(path, offset, limit)``: inspect the dumped conversation "
|
||||
"history file whose path is given in the user message. Use this when "
|
||||
"the snippet alone isn't enough to rule. Each call costs tokens, so "
|
||||
"only read when needed.\n"
|
||||
"- ``update_checklist(updates, new_items, reason)``: issue your "
|
||||
"verdict. Call this exactly once per turn when you are ready to rule. "
|
||||
"Calling it ENDS the evaluation.\n\n"
|
||||
"You MUST call one of these tools every turn. Do not reply with "
|
||||
"prose or JSON in your message body — the system will not see "
|
||||
"anything written outside tool calls. When you cite evidence, "
|
||||
"reference the agent's actual output specifically."
|
||||
)
|
||||
|
||||
EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = (
|
||||
|
|
@ -645,6 +643,17 @@ def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]:
|
|||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Judge tool schemas: read_file (history inspection) +
|
||||
# submit_checklist (Phase A) + update_checklist (Phase B)
|
||||
#
|
||||
# Forcing the judge to emit through tool calls is dramatically more
|
||||
# reliable than asking it to reply with JSON text. Most providers
|
||||
# enforce the schema server-side, so weak/small judge models can no
|
||||
# longer drift into prose, markdown fences, or empty bodies.
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
|
|
@ -685,6 +694,122 @@ _JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
|
|||
}
|
||||
|
||||
|
||||
_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "submit_checklist",
|
||||
"description": (
|
||||
"Submit the harsh, detailed completion-criteria checklist you "
|
||||
"decomposed the goal into. Each item is one verifiable "
|
||||
"completion criterion. Bias toward more items, not fewer."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"description": (
|
||||
"List of checklist items. Each item is a single "
|
||||
"verifiable statement of fact about the finished "
|
||||
"work. Aim for at least 5 items; more is better "
|
||||
"when warranted."
|
||||
),
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The completion-criterion text.",
|
||||
},
|
||||
},
|
||||
"required": ["text"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["items"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "update_checklist",
|
||||
"description": (
|
||||
"Issue your verdict on the current checklist. For each "
|
||||
"currently-pending item, decide whether the agent's most "
|
||||
"recent response (and conversation history if you read it) "
|
||||
"shows the item is satisfied. You may also append new items "
|
||||
"the original decomposition missed. Call this exactly once "
|
||||
"when you are ready to rule — calling it ends the evaluation."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"updates": {
|
||||
"type": "array",
|
||||
"description": (
|
||||
"Per-item rulings. Use the 1-based ``index`` shown "
|
||||
"in the checklist. ``status`` must be 'completed' "
|
||||
"(clear evidence the item is done) or 'impossible' "
|
||||
"(item cannot be achieved in this environment). "
|
||||
"Items already in a terminal status are frozen — "
|
||||
"do not include them."
|
||||
),
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "1-based checklist index.",
|
||||
},
|
||||
"status": {
|
||||
"type": "string",
|
||||
"enum": ["completed", "impossible"],
|
||||
},
|
||||
"evidence": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"One-sentence specific citation of why "
|
||||
"this item is done or impossible. "
|
||||
"Reference the agent's actual output."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["index", "status", "evidence"],
|
||||
},
|
||||
},
|
||||
"new_items": {
|
||||
"type": "array",
|
||||
"description": (
|
||||
"Optional: completion criteria the original "
|
||||
"decomposition missed. Stay strict — only add "
|
||||
"items that genuinely belong as completion "
|
||||
"criteria for this goal."
|
||||
),
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The new criterion text.",
|
||||
},
|
||||
},
|
||||
"required": ["text"],
|
||||
},
|
||||
},
|
||||
"reason": {
|
||||
"type": "string",
|
||||
"description": "One-sentence overall rationale for this round of updates.",
|
||||
},
|
||||
},
|
||||
"required": ["updates", "new_items", "reason"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _judge_read_file(
|
||||
path: str,
|
||||
*,
|
||||
|
|
@ -770,15 +895,125 @@ def _get_judge_client() -> Tuple[Optional[Any], str]:
|
|||
return client, model
|
||||
|
||||
|
||||
def _extract_tool_call(msg: Any, tool_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""Find a tool call by name on a chat-completions message. Returns
|
||||
``{"id", "name", "arguments": <dict>}`` or None.
|
||||
|
||||
Robust to provider shims that return tool_calls as objects or dicts
|
||||
and arguments as JSON strings or already-parsed dicts.
|
||||
"""
|
||||
tool_calls = getattr(msg, "tool_calls", None) or []
|
||||
for tc in tool_calls:
|
||||
try:
|
||||
tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?"
|
||||
fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None)
|
||||
if fn is None:
|
||||
continue
|
||||
fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "")
|
||||
if fn_name != tool_name:
|
||||
continue
|
||||
fn_args_raw = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "")
|
||||
if isinstance(fn_args_raw, str):
|
||||
try:
|
||||
args = json.loads(fn_args_raw) if fn_args_raw else {}
|
||||
except Exception:
|
||||
args = {}
|
||||
elif isinstance(fn_args_raw, dict):
|
||||
args = fn_args_raw
|
||||
else:
|
||||
args = {}
|
||||
return {"id": tc_id, "name": fn_name, "arguments": args}
|
||||
except Exception:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def _serialize_assistant_tool_calls(msg: Any) -> List[Dict[str, Any]]:
|
||||
"""Convert a provider-shim tool_calls list into plain-dict form for
|
||||
inclusion in subsequent ``messages=[...]`` payloads."""
|
||||
out: List[Dict[str, Any]] = []
|
||||
for tc in getattr(msg, "tool_calls", None) or []:
|
||||
try:
|
||||
tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?"
|
||||
fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None)
|
||||
fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "")
|
||||
fn_args = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "")
|
||||
if not isinstance(fn_args, str):
|
||||
try:
|
||||
fn_args = json.dumps(fn_args)
|
||||
except Exception:
|
||||
fn_args = "{}"
|
||||
out.append({
|
||||
"id": tc_id,
|
||||
"type": "function",
|
||||
"function": {"name": fn_name or "", "arguments": fn_args},
|
||||
})
|
||||
except Exception:
|
||||
continue
|
||||
return out
|
||||
|
||||
|
||||
def _call_judge_with_tool_choice(
|
||||
client: Any,
|
||||
*,
|
||||
model: str,
|
||||
messages: List[Dict[str, Any]],
|
||||
tools: List[Dict[str, Any]],
|
||||
forced_tool_name: Optional[str],
|
||||
timeout: float,
|
||||
max_tokens: int = 1500,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Call the judge with a forced tool choice, falling back to ``auto``
|
||||
if the provider rejects ``required`` / a specific function choice.
|
||||
|
||||
Returns ``(response, error)``. On success, ``error`` is None.
|
||||
"""
|
||||
# First attempt: force the specific tool. Most modern providers
|
||||
# support {"type": "function", "function": {"name": "..."}}.
|
||||
primary_choice: Any
|
||||
if forced_tool_name:
|
||||
primary_choice = {"type": "function", "function": {"name": forced_tool_name}}
|
||||
else:
|
||||
primary_choice = "required"
|
||||
|
||||
attempts: List[Any] = [primary_choice, "required", "auto"]
|
||||
last_err: Optional[str] = None
|
||||
for choice in attempts:
|
||||
try:
|
||||
return client.chat.completions.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice=choice,
|
||||
temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
timeout=timeout,
|
||||
), None
|
||||
except Exception as exc:
|
||||
last_err = f"{type(exc).__name__}: {exc}"
|
||||
# Only retry on errors that look like the provider rejecting the
|
||||
# tool_choice shape. Network errors etc. should bail immediately.
|
||||
msg = str(exc).lower()
|
||||
if not any(token in msg for token in (
|
||||
"tool_choice", "tool choice", "required", "function call",
|
||||
"unsupported", "not supported", "invalid", "400",
|
||||
)):
|
||||
return None, last_err
|
||||
logger.debug("goal judge: tool_choice=%r rejected (%s); falling back", choice, exc)
|
||||
continue
|
||||
return None, last_err or "all tool_choice fallbacks failed"
|
||||
|
||||
|
||||
def decompose_goal(
|
||||
goal: str,
|
||||
*,
|
||||
timeout: float = DEFAULT_JUDGE_TIMEOUT,
|
||||
) -> Tuple[List[Dict[str, Any]], Optional[str]]:
|
||||
"""Phase-A: ask the judge to break the goal into a checklist.
|
||||
"""Phase-A: ask the judge to break the goal into a checklist via a
|
||||
forced ``submit_checklist`` tool call.
|
||||
|
||||
Returns ``(items, error)``. On any failure, returns ``([], reason)`` so
|
||||
the caller can decide whether to fall back to freeform mode.
|
||||
Returns ``(items, error)``. On any failure, returns ``([], reason)``
|
||||
so the caller can fall back to freeform mode.
|
||||
"""
|
||||
if not goal.strip():
|
||||
return [], "empty goal"
|
||||
|
|
@ -787,39 +1022,68 @@ def decompose_goal(
|
|||
if client is None:
|
||||
return [], "auxiliary client unavailable"
|
||||
|
||||
try:
|
||||
resp = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
|
||||
{
|
||||
"role": "user",
|
||||
"content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
|
||||
goal=_truncate(goal, 4000)
|
||||
),
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=2000,
|
||||
timeout=timeout,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.info("goal decompose: API call failed (%s)", exc)
|
||||
return [], f"decompose error: {type(exc).__name__}"
|
||||
messages = [
|
||||
{"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
|
||||
{
|
||||
"role": "user",
|
||||
"content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
|
||||
goal=_truncate(goal, 4000)
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
resp, err = _call_judge_with_tool_choice(
|
||||
client,
|
||||
model=model,
|
||||
messages=messages,
|
||||
tools=[_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA],
|
||||
forced_tool_name="submit_checklist",
|
||||
timeout=timeout,
|
||||
max_tokens=2000,
|
||||
)
|
||||
if resp is None:
|
||||
logger.info("goal decompose: API call failed (%s)", err)
|
||||
return [], f"decompose error: {err}"
|
||||
|
||||
try:
|
||||
raw = resp.choices[0].message.content or ""
|
||||
msg = resp.choices[0].message
|
||||
except Exception:
|
||||
raw = ""
|
||||
return [], "decompose response malformed"
|
||||
|
||||
items, parse_failed = _parse_decompose_response(raw)
|
||||
if parse_failed or not items:
|
||||
logger.info(
|
||||
"goal decompose: parse failed or empty checklist (raw=%r)",
|
||||
_truncate(raw, 200),
|
||||
)
|
||||
return [], "decompose parse failed or empty"
|
||||
logger.info("goal decompose: produced %d checklist items", len(items))
|
||||
tc = _extract_tool_call(msg, "submit_checklist")
|
||||
if tc is None:
|
||||
# Provider responded but didn't call the tool. Try parsing content
|
||||
# as a last-ditch backstop so a fully-broken provider doesn't
|
||||
# silently leave the user with no checklist at all.
|
||||
content = getattr(msg, "content", "") or ""
|
||||
items, parse_failed = _parse_decompose_response(content)
|
||||
if parse_failed or not items:
|
||||
logger.info(
|
||||
"goal decompose: no submit_checklist tool call AND no parseable JSON (raw=%r)",
|
||||
_truncate(content, 200),
|
||||
)
|
||||
return [], "decompose: judge did not call submit_checklist"
|
||||
logger.info("goal decompose: fell back to JSON-content parser (%d items)", len(items))
|
||||
return items, None
|
||||
|
||||
raw_items = tc["arguments"].get("items") or []
|
||||
items: List[Dict[str, Any]] = []
|
||||
if isinstance(raw_items, list):
|
||||
for entry in raw_items:
|
||||
if isinstance(entry, dict):
|
||||
text = str(entry.get("text", "")).strip()
|
||||
if text:
|
||||
items.append({"text": text})
|
||||
elif isinstance(entry, str):
|
||||
text = entry.strip()
|
||||
if text:
|
||||
items.append({"text": text})
|
||||
|
||||
if not items:
|
||||
logger.info("goal decompose: submit_checklist returned empty items list")
|
||||
return [], "decompose: empty checklist"
|
||||
|
||||
logger.info("goal decompose: produced %d checklist items via tool call", len(items))
|
||||
return items, None
|
||||
|
||||
|
||||
|
|
@ -882,10 +1146,15 @@ def evaluate_checklist(
|
|||
timeout: float = DEFAULT_JUDGE_TIMEOUT,
|
||||
max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS,
|
||||
) -> Tuple[Dict[str, Any], bool]:
|
||||
"""Phase-B: judge evaluates each pending checklist item.
|
||||
"""Phase-B: judge evaluates each pending checklist item via forced
|
||||
tool calls.
|
||||
|
||||
Runs a bounded tool loop so the judge can call ``read_file`` on the
|
||||
dumped conversation history when the snippet isn't enough.
|
||||
The judge has two tools available:
|
||||
- ``read_file``: inspect the dumped conversation history
|
||||
- ``update_checklist``: issue the verdict (terminates the loop)
|
||||
|
||||
``tool_choice="required"`` forces one of them every iteration. We loop
|
||||
until ``update_checklist`` is called or ``max_tool_calls`` is exhausted.
|
||||
|
||||
Returns ``(parsed, parse_failed)`` where parsed is
|
||||
``{"updates": [...], "new_items": [...], "reason": str}``.
|
||||
|
|
@ -895,7 +1164,8 @@ def evaluate_checklist(
|
|||
if client is None:
|
||||
return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False)
|
||||
|
||||
# Render checklist with 1-based indices the judge can address.
|
||||
# Render checklist with 1-based indices the judge addresses via the
|
||||
# update_checklist tool's ``index`` field.
|
||||
checklist_block = state.render_checklist(numbered=True)
|
||||
|
||||
user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format(
|
||||
|
|
@ -910,136 +1180,170 @@ def evaluate_checklist(
|
|||
{"role": "user", "content": user_prompt},
|
||||
]
|
||||
|
||||
# Some auxiliary providers may not support tool calls. We pass tools
|
||||
# optimistically; if the provider returns a verdict directly without
|
||||
# using them, we just parse it.
|
||||
tools = [_JUDGE_READ_FILE_TOOL_SCHEMA] if history_path is not None else None
|
||||
# Build the toolbox: read_file is only useful when we actually have a
|
||||
# history file to read, so we omit it otherwise to keep the schema lean.
|
||||
tools: List[Dict[str, Any]] = [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
|
||||
if history_path is not None:
|
||||
tools.insert(0, _JUDGE_READ_FILE_TOOL_SCHEMA)
|
||||
|
||||
tool_calls_left = max(0, int(max_tool_calls))
|
||||
final_raw = ""
|
||||
reads_left = max(0, int(max_tool_calls)) if history_path is not None else 0
|
||||
|
||||
for _ in range(tool_calls_left + 1):
|
||||
try:
|
||||
kwargs: Dict[str, Any] = {
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"temperature": 0,
|
||||
"max_tokens": 1500,
|
||||
"timeout": timeout,
|
||||
}
|
||||
if tools:
|
||||
kwargs["tools"] = tools
|
||||
kwargs["tool_choice"] = "auto"
|
||||
resp = client.chat.completions.create(**kwargs)
|
||||
except Exception as exc:
|
||||
logger.info("goal judge (checklist): API call failed (%s)", exc)
|
||||
# Bound the overall loop generously — the judge will normally finish in
|
||||
# one or two passes (read_file once, then update_checklist; or just
|
||||
# update_checklist directly).
|
||||
for iteration in range(reads_left + 2):
|
||||
# When out of read budget, drop read_file from the toolbox so the
|
||||
# judge MUST emit update_checklist.
|
||||
loop_tools = tools if reads_left > 0 else [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
|
||||
# Forcing update_checklist directly when reads are exhausted gives
|
||||
# us the strongest guarantee of termination.
|
||||
forced = "update_checklist" if reads_left <= 0 else None
|
||||
|
||||
resp, err = _call_judge_with_tool_choice(
|
||||
client,
|
||||
model=model,
|
||||
messages=messages,
|
||||
tools=loop_tools,
|
||||
forced_tool_name=forced,
|
||||
timeout=timeout,
|
||||
max_tokens=1500,
|
||||
)
|
||||
if resp is None:
|
||||
logger.info("goal judge (checklist): API call failed (%s)", err)
|
||||
return (
|
||||
{
|
||||
"updates": [],
|
||||
"new_items": [],
|
||||
"reason": f"judge error: {type(exc).__name__}",
|
||||
"reason": f"judge error: {err}",
|
||||
},
|
||||
False,
|
||||
)
|
||||
|
||||
try:
|
||||
choice = resp.choices[0]
|
||||
msg = choice.message
|
||||
msg = resp.choices[0].message
|
||||
except Exception:
|
||||
return (
|
||||
{"updates": [], "new_items": [], "reason": "judge response malformed"},
|
||||
True,
|
||||
)
|
||||
|
||||
# Unpack tool_calls in a way that works for openai-py and other shims.
|
||||
tool_calls = getattr(msg, "tool_calls", None) or []
|
||||
content = getattr(msg, "content", "") or ""
|
||||
# Did the judge call update_checklist? If yes, we're done.
|
||||
update_tc = _extract_tool_call(msg, "update_checklist")
|
||||
if update_tc is not None:
|
||||
parsed = _normalize_update_args(update_tc["arguments"])
|
||||
logger.info(
|
||||
"goal judge (checklist): updates=%d new_items=%d reason=%s",
|
||||
len(parsed.get("updates") or []),
|
||||
len(parsed.get("new_items") or []),
|
||||
_truncate(parsed.get("reason", ""), 120),
|
||||
)
|
||||
return parsed, False
|
||||
|
||||
if not tool_calls:
|
||||
final_raw = content
|
||||
break
|
||||
|
||||
if tool_calls_left <= 0:
|
||||
# Out of budget. Force a final ruling on the next pass by
|
||||
# appending a system note and disabling tools.
|
||||
# Did the judge call read_file? If yes, run it and feed the result back.
|
||||
read_tc = _extract_tool_call(msg, "read_file")
|
||||
if read_tc is not None and reads_left > 0:
|
||||
args = read_tc["arguments"]
|
||||
tool_result = _judge_read_file(
|
||||
str(args.get("path", "")),
|
||||
offset=args.get("offset", 1),
|
||||
limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
|
||||
allowed_path=history_path,
|
||||
)
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": (
|
||||
"You have exhausted your read_file budget. Issue your "
|
||||
"final JSON verdict now without calling more tools."
|
||||
),
|
||||
"role": "assistant",
|
||||
"content": getattr(msg, "content", "") or "",
|
||||
"tool_calls": _serialize_assistant_tool_calls(msg),
|
||||
})
|
||||
tools = None
|
||||
continue
|
||||
|
||||
# Append the assistant turn, then handle each tool call.
|
||||
assistant_record: Dict[str, Any] = {
|
||||
"role": "assistant",
|
||||
"content": content,
|
||||
"tool_calls": [],
|
||||
}
|
||||
for tc in tool_calls:
|
||||
try:
|
||||
tc_id = getattr(tc, "id", None) or "tc-?"
|
||||
fn = getattr(tc, "function", None)
|
||||
fn_name = getattr(fn, "name", "") if fn is not None else ""
|
||||
fn_args = getattr(fn, "arguments", "") if fn is not None else ""
|
||||
assistant_record["tool_calls"].append({
|
||||
"id": tc_id,
|
||||
"type": "function",
|
||||
"function": {"name": fn_name, "arguments": fn_args},
|
||||
})
|
||||
except Exception:
|
||||
continue
|
||||
messages.append(assistant_record)
|
||||
|
||||
for tc in tool_calls:
|
||||
try:
|
||||
tc_id = getattr(tc, "id", None) or "tc-?"
|
||||
fn = getattr(tc, "function", None)
|
||||
fn_name = getattr(fn, "name", "") if fn is not None else ""
|
||||
fn_args_raw = getattr(fn, "arguments", "") if fn is not None else ""
|
||||
except Exception:
|
||||
continue
|
||||
try:
|
||||
args = json.loads(fn_args_raw) if isinstance(fn_args_raw, str) else (fn_args_raw or {})
|
||||
except Exception:
|
||||
args = {}
|
||||
if fn_name == "read_file":
|
||||
tool_result = _judge_read_file(
|
||||
str(args.get("path", "")),
|
||||
offset=args.get("offset", 1),
|
||||
limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
|
||||
allowed_path=history_path,
|
||||
)
|
||||
else:
|
||||
tool_result = json.dumps({"error": f"unknown tool: {fn_name}"})
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tc_id,
|
||||
"name": fn_name,
|
||||
"tool_call_id": read_tc["id"],
|
||||
"name": "read_file",
|
||||
"content": tool_result,
|
||||
})
|
||||
tool_calls_left -= 1
|
||||
reads_left -= 1
|
||||
continue
|
||||
|
||||
if tool_calls_left <= 0:
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": (
|
||||
"You have exhausted your read_file budget. Issue your "
|
||||
"final JSON verdict now without calling more tools."
|
||||
),
|
||||
})
|
||||
tools = None
|
||||
# Neither tool was called. Try parsing the content body as a last-
|
||||
# ditch backstop, then bail.
|
||||
content = getattr(msg, "content", "") or ""
|
||||
if content.strip():
|
||||
parsed, parse_failed = _parse_evaluate_response(content)
|
||||
if not parse_failed:
|
||||
logger.info(
|
||||
"goal judge (checklist): fell back to JSON-content parser "
|
||||
"updates=%d new_items=%d",
|
||||
len(parsed.get("updates") or []),
|
||||
len(parsed.get("new_items") or []),
|
||||
)
|
||||
return parsed, False
|
||||
logger.info(
|
||||
"goal judge (checklist): judge emitted neither read_file nor "
|
||||
"update_checklist (iteration=%d, content=%r) — bailing",
|
||||
iteration, _truncate(content, 120),
|
||||
)
|
||||
return (
|
||||
{
|
||||
"updates": [],
|
||||
"new_items": [],
|
||||
"reason": "judge did not call update_checklist",
|
||||
},
|
||||
True,
|
||||
)
|
||||
|
||||
parsed, parse_failed = _parse_evaluate_response(final_raw)
|
||||
logger.info(
|
||||
"goal judge (checklist): updates=%d new_items=%d reason=%s",
|
||||
len(parsed.get("updates") or []),
|
||||
len(parsed.get("new_items") or []),
|
||||
_truncate(parsed.get("reason", ""), 120),
|
||||
# Loop exhausted without an update_checklist call.
|
||||
return (
|
||||
{
|
||||
"updates": [],
|
||||
"new_items": [],
|
||||
"reason": "judge tool-loop exhausted without verdict",
|
||||
},
|
||||
True,
|
||||
)
|
||||
return parsed, parse_failed
|
||||
|
||||
|
||||
def _normalize_update_args(args: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate and normalize the ``update_checklist`` tool arguments.
|
||||
|
||||
Performs the same 1-based → 0-based conversion and terminal-status
|
||||
filter as ``_parse_evaluate_response``. Returns the canonical
|
||||
``{updates, new_items, reason}`` shape callers expect.
|
||||
"""
|
||||
raw_updates = args.get("updates") or []
|
||||
raw_new = args.get("new_items") or []
|
||||
reason = str(args.get("reason") or "").strip() or "no reason provided"
|
||||
|
||||
norm_updates: List[Dict[str, Any]] = []
|
||||
if isinstance(raw_updates, list):
|
||||
for upd in raw_updates:
|
||||
if not isinstance(upd, dict):
|
||||
continue
|
||||
try:
|
||||
idx_1based = int(upd.get("index"))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
status = str(upd.get("status", "")).strip().lower()
|
||||
if status not in TERMINAL_ITEM_STATUSES:
|
||||
continue
|
||||
evidence = str(upd.get("evidence") or "").strip() or None
|
||||
norm_updates.append({
|
||||
"index": idx_1based - 1, # 1-based → 0-based for apply layer
|
||||
"status": status,
|
||||
"evidence": evidence,
|
||||
})
|
||||
|
||||
norm_new: List[Dict[str, Any]] = []
|
||||
if isinstance(raw_new, list):
|
||||
for it in raw_new:
|
||||
if isinstance(it, dict):
|
||||
text = str(it.get("text", "")).strip()
|
||||
if text:
|
||||
norm_new.append({"text": text})
|
||||
elif isinstance(it, str):
|
||||
text = it.strip()
|
||||
if text:
|
||||
norm_new.append({"text": text})
|
||||
|
||||
return {"updates": norm_updates, "new_items": norm_new, "reason": reason}
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -1127,3 +1127,251 @@ class TestGoalSurvivesCompressionRotation:
|
|||
|
||||
# Child should still have no goal.
|
||||
assert load_goal("child-no-goal") is None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeFn:
|
||||
def __init__(self, name, args):
|
||||
self.name = name
|
||||
self.arguments = args if isinstance(args, str) else json.dumps(args)
|
||||
|
||||
|
||||
class _FakeToolCall:
|
||||
def __init__(self, tc_id, name, args):
|
||||
self.id = tc_id
|
||||
self.type = "function"
|
||||
self.function = _FakeFn(name, args)
|
||||
|
||||
|
||||
class _FakeMessage:
|
||||
def __init__(self, *, content="", tool_calls=None):
|
||||
self.content = content
|
||||
self.tool_calls = tool_calls or []
|
||||
|
||||
|
||||
class _FakeChoice:
|
||||
def __init__(self, message):
|
||||
self.message = message
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
def __init__(self, message):
|
||||
self.choices = [_FakeChoice(message)]
|
||||
|
||||
|
||||
def _make_fake_client(scripted_messages):
|
||||
"""Return a fake client whose .chat.completions.create() returns the
|
||||
next scripted message each call. Mutates the underlying list as a
|
||||
queue so repeat calls advance.
|
||||
"""
|
||||
class FakeClient:
|
||||
class chat:
|
||||
class completions:
|
||||
_queue = list(scripted_messages)
|
||||
_calls = []
|
||||
|
||||
@classmethod
|
||||
def create(cls, **kwargs):
|
||||
cls._calls.append(kwargs)
|
||||
if not cls._queue:
|
||||
raise RuntimeError("scripted-message queue exhausted")
|
||||
return _FakeResponse(cls._queue.pop(0))
|
||||
|
||||
return FakeClient
|
||||
|
||||
|
||||
class TestPhaseAToolCall:
|
||||
def test_decompose_via_submit_checklist_tool(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "submit_checklist",
|
||||
{"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("build a website")
|
||||
|
||||
assert err is None
|
||||
assert [it["text"] for it in items] == ["first criterion", "second criterion"]
|
||||
# Verify we forced the tool: tool_choice should target submit_checklist.
|
||||
call = client.chat.completions._calls[0]
|
||||
assert "tools" in call
|
||||
assert call["tools"][0]["function"]["name"] == "submit_checklist"
|
||||
# tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
|
||||
# or "required" / "auto" if a fallback was used; primary attempt forces it.
|
||||
tc = call["tool_choice"]
|
||||
assert (
|
||||
(isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
|
||||
or tc == "required"
|
||||
or tc == "auto"
|
||||
)
|
||||
|
||||
def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
|
||||
"""If a broken provider returns content instead of a tool call, the
|
||||
backstop JSON parser still salvages a checklist."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
content='{"checklist": [{"text": "salvaged"}]}',
|
||||
tool_calls=[],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert err is None
|
||||
assert items == [{"text": "salvaged"}]
|
||||
|
||||
def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert items == []
|
||||
assert err and "submit_checklist" in err
|
||||
|
||||
def test_decompose_drops_empty_text_items(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import decompose_goal
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "submit_checklist",
|
||||
{"items": [{"text": "ok"}, {"text": ""}, {"text": " "}, {"text": "two"}]},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
items, err = decompose_goal("g")
|
||||
|
||||
assert err is None
|
||||
assert [it["text"] for it in items] == ["ok", "two"]
|
||||
|
||||
|
||||
class TestPhaseBToolCall:
|
||||
def test_evaluate_via_update_checklist_tool(self, hermes_home):
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[
|
||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
||||
],
|
||||
)
|
||||
|
||||
msg = _FakeMessage(
|
||||
tool_calls=[_FakeToolCall(
|
||||
"tc-1", "update_checklist",
|
||||
{
|
||||
# 1-based indices; layer converts to 0-based.
|
||||
"updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
|
||||
"new_items": [{"text": "discovered c"}],
|
||||
"reason": "ran a",
|
||||
},
|
||||
)],
|
||||
)
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, parse_failed = evaluate_checklist(
|
||||
state, "did the first thing", history_path=None,
|
||||
)
|
||||
|
||||
assert parse_failed is False
|
||||
# Index converted 1 → 0
|
||||
assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
|
||||
assert parsed["new_items"] == [{"text": "discovered c"}]
|
||||
assert parsed["reason"] == "ran a"
|
||||
|
||||
def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
|
||||
"""Phase-B tool loop: judge calls read_file once, then update_checklist."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
# Make a real history file so the path-restriction check passes.
|
||||
hist = tmp_path / "hist.json"
|
||||
hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
|
||||
)
|
||||
|
||||
msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
|
||||
)])
|
||||
msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-2", "update_checklist",
|
||||
{
|
||||
"updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
|
||||
"new_items": [],
|
||||
"reason": "verified via read_file",
|
||||
},
|
||||
)])
|
||||
client = _make_fake_client([msg1, msg2])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, parse_failed = evaluate_checklist(
|
||||
state, "did the thing", history_path=hist,
|
||||
)
|
||||
|
||||
assert parse_failed is False
|
||||
assert parsed["updates"][0]["status"] == "completed"
|
||||
assert parsed["reason"] == "verified via read_file"
|
||||
# Two API calls — one for the read, one for the verdict.
|
||||
assert len(client.chat.completions._calls) == 2
|
||||
|
||||
def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
|
||||
"""update_checklist should only accept 'completed' or 'impossible' —
|
||||
any 'pending' updates are dropped at the normalize layer."""
|
||||
from hermes_cli import goals
|
||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
||||
|
||||
state = GoalState(
|
||||
goal="g",
|
||||
decomposed=True,
|
||||
checklist=[
|
||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
||||
],
|
||||
)
|
||||
msg = _FakeMessage(tool_calls=[_FakeToolCall(
|
||||
"tc-1", "update_checklist",
|
||||
{
|
||||
"updates": [
|
||||
{"index": 1, "status": "completed", "evidence": "yes"},
|
||||
{"index": 2, "status": "pending", "evidence": "skip me"},
|
||||
],
|
||||
"new_items": [],
|
||||
"reason": "...",
|
||||
},
|
||||
)])
|
||||
client = _make_fake_client([msg])
|
||||
|
||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
||||
parsed, _pf = evaluate_checklist(state, "x", history_path=None)
|
||||
|
||||
# Only the completed flip survives; pending update is dropped silently.
|
||||
assert len(parsed["updates"]) == 1
|
||||
assert parsed["updates"][0]["index"] == 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue