mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
revert: roll back /goal checklist + /subgoal feature stack (#23813)
* Revert "fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)" — this reverts commit a63a2b7c78.
* Revert "fix(goals): forward standing /goal state on auto-compression session rotation (#23530)" — this reverts commit 4a080b1d5a.
* Revert "feat(goals): /goal checklist + /subgoal user controls (#23456)" — this reverts commit 404640a2b7.
This commit is contained in:
parent
1d4a4997b1
commit
3e7145e0bb
9 changed files with 113 additions and 2487 deletions
105
cli.py
105
cli.py
|
|
@ -7252,8 +7252,6 @@ class HermesCLI:
|
||||||
_cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}")
|
_cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}")
|
||||||
elif canonical == "goal":
|
elif canonical == "goal":
|
||||||
self._handle_goal_command(cmd_original)
|
self._handle_goal_command(cmd_original)
|
||||||
elif canonical == "subgoal":
|
|
||||||
self._handle_subgoal_command(cmd_original)
|
|
||||||
elif canonical == "skin":
|
elif canonical == "skin":
|
||||||
self._handle_skin_command(cmd_original)
|
self._handle_skin_command(cmd_original)
|
||||||
elif canonical == "voice":
|
elif canonical == "voice":
|
||||||
|
|
@ -7850,103 +7848,6 @@ class HermesCLI:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _handle_subgoal_command(self, cmd: str) -> None:
|
|
||||||
"""Dispatch /subgoal subcommands.
|
|
||||||
|
|
||||||
Forms:
|
|
||||||
/subgoal show the checklist
|
|
||||||
/subgoal <text> append a user item
|
|
||||||
/subgoal complete <n> mark item n completed
|
|
||||||
/subgoal impossible <n> mark item n impossible
|
|
||||||
/subgoal undo <n> revert item n to pending
|
|
||||||
/subgoal remove <n> delete item n
|
|
||||||
/subgoal clear wipe the checklist (judge re-decomposes)
|
|
||||||
"""
|
|
||||||
parts = (cmd or "").strip().split(None, 2)
|
|
||||||
# parts[0] == "/subgoal"; remainder is what the user typed
|
|
||||||
arg = " ".join(parts[1:]).strip() if len(parts) > 1 else ""
|
|
||||||
|
|
||||||
mgr = self._get_goal_manager()
|
|
||||||
if mgr is None:
|
|
||||||
_cprint(f" {_DIM}Goals unavailable (no active session).{_RST}")
|
|
||||||
return
|
|
||||||
|
|
||||||
if not mgr.has_goal():
|
|
||||||
_cprint(f" {_DIM}No active goal. Set one with /goal <text>.{_RST}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# No args → show the checklist.
|
|
||||||
if not arg:
|
|
||||||
_cprint(f" {mgr.status_line()}")
|
|
||||||
_cprint(f" {mgr.render_checklist()}")
|
|
||||||
return
|
|
||||||
|
|
||||||
tokens = arg.split(None, 1)
|
|
||||||
verb = tokens[0].lower()
|
|
||||||
rest = tokens[1].strip() if len(tokens) > 1 else ""
|
|
||||||
|
|
||||||
# Action verbs operate on indices.
|
|
||||||
action_status_map = {
|
|
||||||
"complete": "completed",
|
|
||||||
"completed": "completed",
|
|
||||||
"done": "completed",
|
|
||||||
"impossible": "impossible",
|
|
||||||
"imp": "impossible",
|
|
||||||
"skip": "impossible",
|
|
||||||
"undo": "pending",
|
|
||||||
"pending": "pending",
|
|
||||||
"reset": "pending",
|
|
||||||
}
|
|
||||||
if verb in action_status_map:
|
|
||||||
if not rest:
|
|
||||||
_cprint(f" Usage: /subgoal {verb} <n>")
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
idx = int(rest.split()[0])
|
|
||||||
except ValueError:
|
|
||||||
_cprint(f" /subgoal {verb}: <n> must be an integer (1-based index).")
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
item = mgr.mark_subgoal(idx, action_status_map[verb])
|
|
||||||
except (IndexError, ValueError, RuntimeError) as exc:
|
|
||||||
_cprint(f" /subgoal {verb}: {exc}")
|
|
||||||
return
|
|
||||||
_cprint(f" ✓ Item {idx} → {item.status}: {item.text}")
|
|
||||||
return
|
|
||||||
|
|
||||||
if verb == "remove":
|
|
||||||
if not rest:
|
|
||||||
_cprint(" Usage: /subgoal remove <n>")
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
idx = int(rest.split()[0])
|
|
||||||
except ValueError:
|
|
||||||
_cprint(" /subgoal remove: <n> must be an integer (1-based index).")
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
removed = mgr.remove_subgoal(idx)
|
|
||||||
except (IndexError, RuntimeError) as exc:
|
|
||||||
_cprint(f" /subgoal remove: {exc}")
|
|
||||||
return
|
|
||||||
_cprint(f" ✓ Removed item {idx}: {removed.text}")
|
|
||||||
return
|
|
||||||
|
|
||||||
if verb == "clear":
|
|
||||||
mgr.clear_checklist()
|
|
||||||
_cprint(
|
|
||||||
" ✓ Checklist cleared. The judge will re-decompose on the next turn."
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Otherwise: append `arg` as a user-authored checklist item.
|
|
||||||
try:
|
|
||||||
item = mgr.add_subgoal(arg)
|
|
||||||
except (ValueError, RuntimeError) as exc:
|
|
||||||
_cprint(f" /subgoal: {exc}")
|
|
||||||
return
|
|
||||||
idx = len(mgr.state.checklist) if mgr.state else 0
|
|
||||||
_cprint(f" ✓ Added subgoal {idx}: {item.text}")
|
|
||||||
|
|
||||||
def _maybe_continue_goal_after_turn(self) -> None:
|
def _maybe_continue_goal_after_turn(self) -> None:
|
||||||
"""Hook run after every CLI turn. Judges + maybe re-queues.
|
"""Hook run after every CLI turn. Judges + maybe re-queues.
|
||||||
|
|
||||||
|
|
@ -8024,11 +7925,7 @@ class HermesCLI:
|
||||||
if not last_response.strip():
|
if not last_response.strip():
|
||||||
return
|
return
|
||||||
|
|
||||||
decision = mgr.evaluate_after_turn(
|
decision = mgr.evaluate_after_turn(last_response, user_initiated=True)
|
||||||
last_response,
|
|
||||||
user_initiated=True,
|
|
||||||
messages=getattr(self, "conversation_history", None) or [],
|
|
||||||
)
|
|
||||||
msg = decision.get("message") or ""
|
msg = decision.get("message") or ""
|
||||||
if msg:
|
if msg:
|
||||||
_cprint(f" {msg}")
|
_cprint(f" {msg}")
|
||||||
|
|
|
||||||
101
gateway/run.py
101
gateway/run.py
|
|
@ -6083,12 +6083,6 @@ class GatewayRunner:
|
||||||
return await self._handle_goal_command(event)
|
return await self._handle_goal_command(event)
|
||||||
return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal."
|
return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal."
|
||||||
|
|
||||||
# /subgoal is safe mid-run — it only modifies the active goal's
|
|
||||||
# checklist, which the judge consults at turn boundaries. There
|
|
||||||
# is no race with the running turn.
|
|
||||||
if _cmd_def_inner and _cmd_def_inner.name == "subgoal":
|
|
||||||
return await self._handle_subgoal_command(event)
|
|
||||||
|
|
||||||
# Session-level toggles that are safe to run mid-agent —
|
# Session-level toggles that are safe to run mid-agent —
|
||||||
# /yolo can unblock a pending approval prompt, /verbose cycles
|
# /yolo can unblock a pending approval prompt, /verbose cycles
|
||||||
# the tool-progress display mode for the ongoing stream.
|
# the tool-progress display mode for the ongoing stream.
|
||||||
|
|
@ -6467,9 +6461,6 @@ class GatewayRunner:
|
||||||
if canonical == "goal":
|
if canonical == "goal":
|
||||||
return await self._handle_goal_command(event)
|
return await self._handle_goal_command(event)
|
||||||
|
|
||||||
if canonical == "subgoal":
|
|
||||||
return await self._handle_subgoal_command(event)
|
|
||||||
|
|
||||||
if canonical == "voice":
|
if canonical == "voice":
|
||||||
return await self._handle_voice_command(event)
|
return await self._handle_voice_command(event)
|
||||||
|
|
||||||
|
|
@ -6654,18 +6645,10 @@ class GatewayRunner:
|
||||||
except Exception:
|
except Exception:
|
||||||
session_entry = None
|
session_entry = None
|
||||||
if session_entry is not None:
|
if session_entry is not None:
|
||||||
# Pull the agent's full messages list from the result
|
|
||||||
# so the judge can dump it for its read_file tool.
|
|
||||||
_agent_messages: list = []
|
|
||||||
if isinstance(_agent_result, dict):
|
|
||||||
_msgs = _agent_result.get("messages")
|
|
||||||
if isinstance(_msgs, list):
|
|
||||||
_agent_messages = _msgs
|
|
||||||
await self._post_turn_goal_continuation(
|
await self._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=source,
|
source=source,
|
||||||
final_response=_final_text,
|
final_response=_final_text,
|
||||||
agent_messages=_agent_messages,
|
|
||||||
)
|
)
|
||||||
except Exception as _goal_exc:
|
except Exception as _goal_exc:
|
||||||
logger.debug("goal continuation hook failed: %s", _goal_exc)
|
logger.debug("goal continuation hook failed: %s", _goal_exc)
|
||||||
|
|
@ -9402,83 +9385,6 @@ class GatewayRunner:
|
||||||
|
|
||||||
return t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
|
return t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
|
||||||
|
|
||||||
async def _handle_subgoal_command(self, event: "MessageEvent") -> str:
|
|
||||||
"""Handle /subgoal for gateway platforms.
|
|
||||||
|
|
||||||
Forms (mirror of CLI):
|
|
||||||
/subgoal show the checklist
|
|
||||||
/subgoal <text> append a user item
|
|
||||||
/subgoal complete <n> | done <n> mark item n completed
|
|
||||||
/subgoal impossible <n> mark item n impossible
|
|
||||||
/subgoal undo <n> revert item n to pending
|
|
||||||
/subgoal remove <n> delete item n
|
|
||||||
/subgoal clear wipe the checklist
|
|
||||||
"""
|
|
||||||
args = (event.get_command_args() or "").strip()
|
|
||||||
|
|
||||||
mgr, _session_entry = self._get_goal_manager_for_event(event)
|
|
||||||
if mgr is None:
|
|
||||||
return t("gateway.goal.unavailable")
|
|
||||||
|
|
||||||
if not mgr.has_goal():
|
|
||||||
return "No active goal. Set one with /goal <text>."
|
|
||||||
|
|
||||||
if not args:
|
|
||||||
return f"{mgr.status_line()}\n{mgr.render_checklist()}"
|
|
||||||
|
|
||||||
tokens = args.split(None, 1)
|
|
||||||
verb = tokens[0].lower()
|
|
||||||
rest = tokens[1].strip() if len(tokens) > 1 else ""
|
|
||||||
|
|
||||||
action_status_map = {
|
|
||||||
"complete": "completed",
|
|
||||||
"completed": "completed",
|
|
||||||
"done": "completed",
|
|
||||||
"impossible": "impossible",
|
|
||||||
"imp": "impossible",
|
|
||||||
"skip": "impossible",
|
|
||||||
"undo": "pending",
|
|
||||||
"pending": "pending",
|
|
||||||
"reset": "pending",
|
|
||||||
}
|
|
||||||
if verb in action_status_map:
|
|
||||||
if not rest:
|
|
||||||
return f"Usage: /subgoal {verb} <n>"
|
|
||||||
try:
|
|
||||||
idx = int(rest.split()[0])
|
|
||||||
except ValueError:
|
|
||||||
return f"/subgoal {verb}: <n> must be an integer (1-based index)."
|
|
||||||
try:
|
|
||||||
item = mgr.mark_subgoal(idx, action_status_map[verb])
|
|
||||||
except (IndexError, ValueError, RuntimeError) as exc:
|
|
||||||
return f"/subgoal {verb}: {exc}"
|
|
||||||
return f"✓ Item {idx} → {item.status}: {item.text}"
|
|
||||||
|
|
||||||
if verb == "remove":
|
|
||||||
if not rest:
|
|
||||||
return "Usage: /subgoal remove <n>"
|
|
||||||
try:
|
|
||||||
idx = int(rest.split()[0])
|
|
||||||
except ValueError:
|
|
||||||
return "/subgoal remove: <n> must be an integer (1-based index)."
|
|
||||||
try:
|
|
||||||
removed = mgr.remove_subgoal(idx)
|
|
||||||
except (IndexError, RuntimeError) as exc:
|
|
||||||
return f"/subgoal remove: {exc}"
|
|
||||||
return f"✓ Removed item {idx}: {removed.text}"
|
|
||||||
|
|
||||||
if verb == "clear":
|
|
||||||
mgr.clear_checklist()
|
|
||||||
return "✓ Checklist cleared. The judge will re-decompose on the next turn."
|
|
||||||
|
|
||||||
# Otherwise — append `args` as a new user-authored checklist item.
|
|
||||||
try:
|
|
||||||
item = mgr.add_subgoal(args)
|
|
||||||
except (ValueError, RuntimeError) as exc:
|
|
||||||
return f"/subgoal: {exc}"
|
|
||||||
idx = len(mgr.state.checklist) if mgr.state else 0
|
|
||||||
return f"✓ Added subgoal {idx}: {item.text}"
|
|
||||||
|
|
||||||
async def _send_goal_status_notice(self, source: Any, message: str) -> None:
|
async def _send_goal_status_notice(self, source: Any, message: str) -> None:
|
||||||
"""Send a /goal judge status line back to the originating chat/thread."""
|
"""Send a /goal judge status line back to the originating chat/thread."""
|
||||||
adapter = self.adapters.get(source.platform)
|
adapter = self.adapters.get(source.platform)
|
||||||
|
|
@ -9547,7 +9453,6 @@ class GatewayRunner:
|
||||||
session_entry: Any,
|
session_entry: Any,
|
||||||
source: Any,
|
source: Any,
|
||||||
final_response: str,
|
final_response: str,
|
||||||
agent_messages: Optional[list] = None,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Run the goal judge after a gateway turn and, if still active,
|
"""Run the goal judge after a gateway turn and, if still active,
|
||||||
enqueue a continuation prompt for the same session.
|
enqueue a continuation prompt for the same session.
|
||||||
|
|
@ -9575,11 +9480,7 @@ class GatewayRunner:
|
||||||
if not mgr.is_active():
|
if not mgr.is_active():
|
||||||
return
|
return
|
||||||
|
|
||||||
decision = mgr.evaluate_after_turn(
|
decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True)
|
||||||
final_response or "",
|
|
||||||
user_initiated=True,
|
|
||||||
messages=agent_messages or [],
|
|
||||||
)
|
|
||||||
msg = decision.get("message") or ""
|
msg = decision.get("message") or ""
|
||||||
|
|
||||||
# Defer the status line until after the adapter has delivered the
|
# Defer the status line until after the adapter has delivered the
|
||||||
|
|
|
||||||
|
|
@ -104,8 +104,6 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||||
args_hint="<prompt>"),
|
args_hint="<prompt>"),
|
||||||
CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session",
|
CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session",
|
||||||
args_hint="[text | pause | resume | clear | status]"),
|
args_hint="[text | pause | resume | clear | status]"),
|
||||||
CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session",
|
|
||||||
args_hint="[text | complete N | impossible N | undo N | remove N | clear]"),
|
|
||||||
CommandDef("status", "Show session info", "Session"),
|
CommandDef("status", "Show session info", "Session"),
|
||||||
CommandDef("whoami", "Show your slash command access (admin / user)", "Info"),
|
CommandDef("whoami", "Show your slash command access (admin / user)", "Info"),
|
||||||
CommandDef("profile", "Show active profile name and home directory", "Info"),
|
CommandDef("profile", "Show active profile name and home directory", "Info"),
|
||||||
|
|
|
||||||
1438
hermes_cli/goals.py
1438
hermes_cli/goals.py
File diff suppressed because it is too large
Load diff
19
run_agent.py
19
run_agent.py
|
|
@ -10089,25 +10089,6 @@ class AIAgent:
|
||||||
parent_session_id=old_session_id,
|
parent_session_id=old_session_id,
|
||||||
)
|
)
|
||||||
self._session_db_created = True
|
self._session_db_created = True
|
||||||
# Forward any standing /goal state from the parent session to
|
|
||||||
# the continuation session so the goal loop survives
|
|
||||||
# auto-compression. Without this rebind, _get_goal_manager()
|
|
||||||
# constructs a fresh manager keyed on the new session_id,
|
|
||||||
# load_goal() returns None, mgr.is_active() is False, and
|
|
||||||
# the loop silently dies mid-task. The goal is stored in
|
|
||||||
# state_meta under "goal:<sid>" by hermes_cli.goals.
|
|
||||||
try:
|
|
||||||
_goal_meta_key_old = f"goal:{old_session_id}"
|
|
||||||
_goal_meta_key_new = f"goal:{self.session_id}"
|
|
||||||
_goal_blob = self._session_db.get_meta(_goal_meta_key_old)
|
|
||||||
if _goal_blob:
|
|
||||||
self._session_db.set_meta(_goal_meta_key_new, _goal_blob)
|
|
||||||
logger.info(
|
|
||||||
"goal: forwarded standing goal from %s → %s on compression",
|
|
||||||
old_session_id, self.session_id,
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
|
||||||
logger.debug("goal forward on compression failed: %s", exc)
|
|
||||||
# Auto-number the title for the continuation session
|
# Auto-number the title for the continuation session
|
||||||
if old_title:
|
if old_title:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -58,11 +58,6 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"):
|
||||||
|
|
||||||
mgr = GoalManager(session_id=session_id, default_max_turns=5)
|
mgr = GoalManager(session_id=session_id, default_max_turns=5)
|
||||||
mgr.set(goal_text)
|
mgr.set(goal_text)
|
||||||
# Skip Phase-A decompose so tests can patch judge_goal_freeform directly
|
|
||||||
# for legacy verdict assertions.
|
|
||||||
mgr.state.decomposed = True
|
|
||||||
from hermes_cli.goals import save_goal as _sg
|
|
||||||
_sg(mgr.session_id, mgr.state)
|
|
||||||
cli._goal_manager = mgr
|
cli._goal_manager = mgr
|
||||||
return cli, mgr
|
return cli, mgr
|
||||||
|
|
||||||
|
|
@ -86,7 +81,7 @@ class TestInterruptAutoPause:
|
||||||
|
|
||||||
# Judge MUST NOT run on an interrupted turn. If it does, we've
|
# Judge MUST NOT run on an interrupted turn. If it does, we've
|
||||||
# regressed — fail loudly instead of silently querying a mock.
|
# regressed — fail loudly instead of silently querying a mock.
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
|
with patch("hermes_cli.goals.judge_goal") as judge_mock:
|
||||||
judge_mock.side_effect = AssertionError(
|
judge_mock.side_effect = AssertionError(
|
||||||
"judge_goal called on an interrupted turn"
|
"judge_goal called on an interrupted turn"
|
||||||
)
|
)
|
||||||
|
|
@ -111,7 +106,7 @@ class TestInterruptAutoPause:
|
||||||
cli.conversation_history = [
|
cli.conversation_history = [
|
||||||
{"role": "assistant", "content": "partial"},
|
{"role": "assistant", "content": "partial"},
|
||||||
]
|
]
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform"):
|
with patch("hermes_cli.goals.judge_goal"):
|
||||||
cli._maybe_continue_goal_after_turn()
|
cli._maybe_continue_goal_after_turn()
|
||||||
assert mgr.state.status == "paused"
|
assert mgr.state.status == "paused"
|
||||||
|
|
||||||
|
|
@ -130,7 +125,7 @@ class TestEmptyResponseSkip:
|
||||||
{"role": "assistant", "content": " \n\n "},
|
{"role": "assistant", "content": " \n\n "},
|
||||||
]
|
]
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
|
with patch("hermes_cli.goals.judge_goal") as judge_mock:
|
||||||
judge_mock.side_effect = AssertionError(
|
judge_mock.side_effect = AssertionError(
|
||||||
"judge_goal called on an empty response"
|
"judge_goal called on an empty response"
|
||||||
)
|
)
|
||||||
|
|
@ -149,7 +144,7 @@ class TestEmptyResponseSkip:
|
||||||
{"role": "user", "content": "go"},
|
{"role": "user", "content": "go"},
|
||||||
]
|
]
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
|
with patch("hermes_cli.goals.judge_goal") as judge_mock:
|
||||||
judge_mock.side_effect = AssertionError(
|
judge_mock.side_effect = AssertionError(
|
||||||
"judge_goal called without an assistant response"
|
"judge_goal called without an assistant response"
|
||||||
)
|
)
|
||||||
|
|
@ -174,7 +169,7 @@ class TestHealthyTurnStillRuns:
|
||||||
|
|
||||||
# Force the judge to say "continue" without touching the network.
|
# Force the judge to say "continue" without touching the network.
|
||||||
with patch(
|
with patch(
|
||||||
"hermes_cli.goals.judge_goal_freeform",
|
"hermes_cli.goals.judge_goal",
|
||||||
return_value=("continue", "needs more steps", False),
|
return_value=("continue", "needs more steps", False),
|
||||||
):
|
):
|
||||||
cli._maybe_continue_goal_after_turn()
|
cli._maybe_continue_goal_after_turn()
|
||||||
|
|
@ -194,7 +189,7 @@ class TestHealthyTurnStillRuns:
|
||||||
]
|
]
|
||||||
|
|
||||||
with patch(
|
with patch(
|
||||||
"hermes_cli.goals.judge_goal_freeform",
|
"hermes_cli.goals.judge_goal",
|
||||||
return_value=("done", "goal satisfied", False),
|
return_value=("done", "goal satisfied", False),
|
||||||
):
|
):
|
||||||
cli._maybe_continue_goal_after_turn()
|
cli._maybe_continue_goal_after_turn()
|
||||||
|
|
|
||||||
|
|
@ -106,11 +106,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
|
||||||
|
|
||||||
mgr = GoalManager(session_entry.session_id)
|
mgr = GoalManager(session_entry.session_id)
|
||||||
mgr.set("ship the feature")
|
mgr.set("ship the feature")
|
||||||
mgr.state.decomposed = True
|
|
||||||
from hermes_cli.goals import save_goal as _sg
|
|
||||||
_sg(mgr.session_id, mgr.state)
|
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)):
|
with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
|
||||||
await runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
|
|
@ -138,11 +135,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
|
||||||
|
|
||||||
mgr = GoalManager(session_entry.session_id)
|
mgr = GoalManager(session_entry.session_id)
|
||||||
mgr.set("polish the docs")
|
mgr.set("polish the docs")
|
||||||
mgr.state.decomposed = True
|
|
||||||
from hermes_cli.goals import save_goal as _sg
|
|
||||||
_sg(mgr.session_id, mgr.state)
|
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)):
|
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
|
||||||
await runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
|
|
@ -170,7 +164,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
|
||||||
state.turns_used = 2
|
state.turns_used = 2
|
||||||
save_goal(session_entry.session_id, state)
|
save_goal(session_entry.session_id, state)
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)):
|
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
|
||||||
await runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
|
|
@ -217,7 +211,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
|
||||||
|
|
||||||
runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
|
runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)):
|
with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
|
||||||
# must not raise
|
# must not raise
|
||||||
await runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
|
|
|
||||||
|
|
@ -253,20 +253,14 @@ class TestGoalManager:
|
||||||
assert mgr2.is_active()
|
assert mgr2.is_active()
|
||||||
|
|
||||||
def test_evaluate_after_turn_done(self, hermes_home):
|
def test_evaluate_after_turn_done(self, hermes_home):
|
||||||
"""Judge says done → status=done, no continuation.
|
"""Judge says done → status=done, no continuation."""
|
||||||
|
|
||||||
Skips Phase-A decompose by patching ``decompose_goal`` to return
|
|
||||||
an empty checklist so the manager falls through to the freeform
|
|
||||||
judge path (legacy behavior preserved when decompose is unavailable).
|
|
||||||
"""
|
|
||||||
from hermes_cli import goals
|
from hermes_cli import goals
|
||||||
from hermes_cli.goals import GoalManager
|
from hermes_cli.goals import GoalManager
|
||||||
|
|
||||||
mgr = GoalManager(session_id="eval-sid-1")
|
mgr = GoalManager(session_id="eval-sid-1")
|
||||||
mgr.set("ship it")
|
mgr.set("ship it")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
|
||||||
patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)):
|
|
||||||
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
||||||
|
|
||||||
assert decision["verdict"] == "done"
|
assert decision["verdict"] == "done"
|
||||||
|
|
@ -282,8 +276,7 @@ class TestGoalManager:
|
||||||
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
||||||
mgr.set("a long goal")
|
mgr.set("a long goal")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
|
||||||
patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)):
|
|
||||||
decision = mgr.evaluate_after_turn("made some progress")
|
decision = mgr.evaluate_after_turn("made some progress")
|
||||||
|
|
||||||
assert decision["verdict"] == "continue"
|
assert decision["verdict"] == "continue"
|
||||||
|
|
@ -301,8 +294,7 @@ class TestGoalManager:
|
||||||
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
||||||
mgr.set("hard goal")
|
mgr.set("hard goal")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
|
||||||
patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)):
|
|
||||||
d1 = mgr.evaluate_after_turn("step 1")
|
d1 = mgr.evaluate_after_turn("step 1")
|
||||||
assert d1["should_continue"] is True
|
assert d1["should_continue"] is True
|
||||||
assert mgr.state.turns_used == 1
|
assert mgr.state.turns_used == 1
|
||||||
|
|
@ -442,10 +434,8 @@ class TestJudgeParseFailureAutoPause:
|
||||||
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
|
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
|
||||||
mgr.set("do a thing")
|
mgr.set("do a thing")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(
|
||||||
patch.object(
|
goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
|
||||||
goals, "judge_goal_freeform",
|
|
||||||
return_value=("continue", "judge returned empty response", True),
|
|
||||||
):
|
):
|
||||||
d1 = mgr.evaluate_after_turn("step 1")
|
d1 = mgr.evaluate_after_turn("step 1")
|
||||||
assert d1["should_continue"] is True
|
assert d1["should_continue"] is True
|
||||||
|
|
@ -473,20 +463,16 @@ class TestJudgeParseFailureAutoPause:
|
||||||
mgr.set("another goal")
|
mgr.set("another goal")
|
||||||
|
|
||||||
# Two parse failures…
|
# Two parse failures…
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(
|
||||||
patch.object(
|
goals, "judge_goal", return_value=("continue", "not json", True)
|
||||||
goals, "judge_goal_freeform",
|
|
||||||
return_value=("continue", "not json", True),
|
|
||||||
):
|
):
|
||||||
mgr.evaluate_after_turn("step 1")
|
mgr.evaluate_after_turn("step 1")
|
||||||
mgr.evaluate_after_turn("step 2")
|
mgr.evaluate_after_turn("step 2")
|
||||||
assert mgr.state.consecutive_parse_failures == 2
|
assert mgr.state.consecutive_parse_failures == 2
|
||||||
|
|
||||||
# …then one clean reply resets the counter.
|
# …then one clean reply resets the counter.
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(
|
||||||
patch.object(
|
goals, "judge_goal", return_value=("continue", "making progress", False)
|
||||||
goals, "judge_goal_freeform",
|
|
||||||
return_value=("continue", "making progress", False),
|
|
||||||
):
|
):
|
||||||
d = mgr.evaluate_after_turn("step 3")
|
d = mgr.evaluate_after_turn("step 3")
|
||||||
assert d["should_continue"] is True
|
assert d["should_continue"] is True
|
||||||
|
|
@ -500,10 +486,8 @@ class TestJudgeParseFailureAutoPause:
|
||||||
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
|
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
|
||||||
mgr.set("goal")
|
mgr.set("goal")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(
|
||||||
patch.object(
|
goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
|
||||||
goals, "judge_goal_freeform",
|
|
||||||
return_value=("continue", "judge error: RuntimeError", False),
|
|
||||||
):
|
):
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
d = mgr.evaluate_after_turn("still going")
|
d = mgr.evaluate_after_turn("still going")
|
||||||
|
|
@ -521,10 +505,8 @@ class TestJudgeParseFailureAutoPause:
|
||||||
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
|
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
|
||||||
mgr.set("persistent goal")
|
mgr.set("persistent goal")
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
with patch.object(
|
||||||
patch.object(
|
goals, "judge_goal", return_value=("continue", "empty", True)
|
||||||
goals, "judge_goal_freeform",
|
|
||||||
return_value=("continue", "empty", True),
|
|
||||||
):
|
):
|
||||||
mgr.evaluate_after_turn("r")
|
mgr.evaluate_after_turn("r")
|
||||||
mgr.evaluate_after_turn("r")
|
mgr.evaluate_after_turn("r")
|
||||||
|
|
@ -532,846 +514,3 @@ class TestJudgeParseFailureAutoPause:
|
||||||
reloaded = load_goal("parse-fail-sid-4")
|
reloaded = load_goal("parse-fail-sid-4")
|
||||||
assert reloaded is not None
|
assert reloaded is not None
|
||||||
assert reloaded.consecutive_parse_failures == 2
|
assert reloaded.consecutive_parse_failures == 2
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Checklist mode: GoalState backcompat + ChecklistItem
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestGoalStateBackcompat:
|
|
||||||
def test_old_state_meta_row_loads_without_checklist_fields(self):
|
|
||||||
"""A goal serialized BEFORE the checklist fields existed must
|
|
||||||
round-trip through GoalState.from_json with empty defaults."""
|
|
||||||
from hermes_cli.goals import GoalState
|
|
||||||
|
|
||||||
legacy_json = json.dumps({
|
|
||||||
"goal": "do the thing",
|
|
||||||
"status": "active",
|
|
||||||
"turns_used": 3,
|
|
||||||
"max_turns": 20,
|
|
||||||
"created_at": 1.0,
|
|
||||||
"last_turn_at": 2.0,
|
|
||||||
"last_verdict": "continue",
|
|
||||||
"last_reason": "still working",
|
|
||||||
"paused_reason": None,
|
|
||||||
"consecutive_parse_failures": 1,
|
|
||||||
})
|
|
||||||
state = GoalState.from_json(legacy_json)
|
|
||||||
assert state.goal == "do the thing"
|
|
||||||
assert state.checklist == []
|
|
||||||
assert state.decomposed is False
|
|
||||||
|
|
||||||
def test_new_state_round_trip(self):
|
|
||||||
from hermes_cli.goals import (
|
|
||||||
ChecklistItem,
|
|
||||||
GoalState,
|
|
||||||
ITEM_COMPLETED,
|
|
||||||
ITEM_PENDING,
|
|
||||||
ADDED_BY_JUDGE,
|
|
||||||
ADDED_BY_USER,
|
|
||||||
)
|
|
||||||
|
|
||||||
state = GoalState(
|
|
||||||
goal="g",
|
|
||||||
decomposed=True,
|
|
||||||
checklist=[
|
|
||||||
ChecklistItem(text="a", status=ITEM_COMPLETED,
|
|
||||||
added_by=ADDED_BY_JUDGE, evidence="done"),
|
|
||||||
ChecklistItem(text="b", status=ITEM_PENDING,
|
|
||||||
added_by=ADDED_BY_USER),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
round_tripped = GoalState.from_json(state.to_json())
|
|
||||||
assert round_tripped.decomposed is True
|
|
||||||
assert len(round_tripped.checklist) == 2
|
|
||||||
assert round_tripped.checklist[0].text == "a"
|
|
||||||
assert round_tripped.checklist[0].status == ITEM_COMPLETED
|
|
||||||
assert round_tripped.checklist[0].evidence == "done"
|
|
||||||
assert round_tripped.checklist[1].added_by == ADDED_BY_USER
|
|
||||||
|
|
||||||
def test_checklist_counts_and_all_terminal(self):
|
|
||||||
from hermes_cli.goals import (
|
|
||||||
ChecklistItem, GoalState,
|
|
||||||
ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING,
|
|
||||||
)
|
|
||||||
|
|
||||||
state = GoalState(
|
|
||||||
goal="g",
|
|
||||||
checklist=[
|
|
||||||
ChecklistItem(text="a", status=ITEM_COMPLETED),
|
|
||||||
ChecklistItem(text="b", status=ITEM_IMPOSSIBLE),
|
|
||||||
ChecklistItem(text="c", status=ITEM_PENDING),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
total, done, imp, pending = state.checklist_counts()
|
|
||||||
assert (total, done, imp, pending) == (3, 1, 1, 1)
|
|
||||||
assert state.all_terminal() is False
|
|
||||||
|
|
||||||
state.checklist[2].status = ITEM_IMPOSSIBLE
|
|
||||||
assert state.all_terminal() is True
|
|
||||||
|
|
||||||
def test_empty_checklist_is_not_all_terminal(self):
|
|
||||||
"""Empty list must NOT be considered done."""
|
|
||||||
from hermes_cli.goals import GoalState
|
|
||||||
|
|
||||||
state = GoalState(goal="g")
|
|
||||||
assert state.all_terminal() is False
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Phase A: decompose
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestPhaseADecompose:
|
|
||||||
def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE
|
|
||||||
|
|
||||||
mgr = GoalManager(session_id="phase-a-sid-1")
|
|
||||||
mgr.set("build a website")
|
|
||||||
|
|
||||||
items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}]
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=(items, None)):
|
|
||||||
d = mgr.evaluate_after_turn("(initial response)")
|
|
||||||
|
|
||||||
assert d["verdict"] == "decompose"
|
|
||||||
assert d["should_continue"] is True
|
|
||||||
# Phase A produces a continuation prompt that includes the checklist.
|
|
||||||
assert d["continuation_prompt"] is not None
|
|
||||||
assert "Checklist progress" in d["continuation_prompt"]
|
|
||||||
assert mgr.state.decomposed is True
|
|
||||||
assert len(mgr.state.checklist) == 2
|
|
||||||
assert mgr.state.checklist[0].text == "homepage exists"
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_PENDING
|
|
||||||
assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE
|
|
||||||
|
|
||||||
def test_decompose_only_runs_once(self, hermes_home):
|
|
||||||
"""Decomposed=True after first call. Subsequent calls go to Phase B."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
|
|
||||||
mgr = GoalManager(session_id="phase-a-sid-2")
|
|
||||||
mgr.set("g")
|
|
||||||
|
|
||||||
with patch.object(
|
|
||||||
goals, "decompose_goal", return_value=([{"text": "x"}], None)
|
|
||||||
) as decompose_mock, patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=({"updates": [], "new_items": [], "reason": "..."}, False),
|
|
||||||
) as eval_mock:
|
|
||||||
mgr.evaluate_after_turn("turn 1")
|
|
||||||
mgr.evaluate_after_turn("turn 2")
|
|
||||||
mgr.evaluate_after_turn("turn 3")
|
|
||||||
|
|
||||||
assert decompose_mock.call_count == 1
|
|
||||||
assert eval_mock.call_count == 2
|
|
||||||
|
|
||||||
def test_decompose_failure_falls_back_to_freeform(self, hermes_home):
|
|
||||||
"""If decompose returns no items, manager falls through to freeform judge."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
|
|
||||||
mgr = GoalManager(session_id="phase-a-sid-3")
|
|
||||||
mgr.set("g")
|
|
||||||
|
|
||||||
with patch.object(goals, "decompose_goal", return_value=([], "model error")), \
|
|
||||||
patch.object(goals, "judge_goal_freeform",
|
|
||||||
return_value=("done", "shipped", False)):
|
|
||||||
d = mgr.evaluate_after_turn("done!")
|
|
||||||
|
|
||||||
assert d["verdict"] == "done"
|
|
||||||
assert mgr.state.decomposed is True
|
|
||||||
assert mgr.state.checklist == []
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Phase B: evaluate (checklist mode)
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestPhaseBChecklist:
|
|
||||||
def _make_decomposed_mgr(self, sid: str, items):
|
|
||||||
"""Helper: skip Phase A, install a decomposed checklist directly."""
|
|
||||||
from hermes_cli.goals import (
|
|
||||||
GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE,
|
|
||||||
)
|
|
||||||
from hermes_cli import goals as _g
|
|
||||||
mgr = GoalManager(session_id=sid)
|
|
||||||
mgr.set("a goal")
|
|
||||||
mgr.state.decomposed = True
|
|
||||||
mgr.state.checklist = [
|
|
||||||
ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE)
|
|
||||||
for t in items
|
|
||||||
]
|
|
||||||
_g.save_goal(sid, mgr.state)
|
|
||||||
return mgr
|
|
||||||
|
|
||||||
def test_judge_flips_pending_to_completed(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING
|
|
||||||
|
|
||||||
mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"])
|
|
||||||
with patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=(
|
|
||||||
{
|
|
||||||
"updates": [
|
|
||||||
{"index": 0, "status": "completed", "evidence": "done"},
|
|
||||||
{"index": 1, "status": "completed", "evidence": "shipped"},
|
|
||||||
],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "made progress",
|
|
||||||
},
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
d = mgr.evaluate_after_turn("agent did stuff")
|
|
||||||
|
|
||||||
assert d["verdict"] == "continue"
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
||||||
assert mgr.state.checklist[0].evidence == "done"
|
|
||||||
assert mgr.state.checklist[1].status == ITEM_COMPLETED
|
|
||||||
assert mgr.state.checklist[2].status == ITEM_PENDING
|
|
||||||
|
|
||||||
def test_goal_done_when_all_items_terminal(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
|
|
||||||
mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"])
|
|
||||||
with patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=(
|
|
||||||
{
|
|
||||||
"updates": [
|
|
||||||
{"index": 0, "status": "completed", "evidence": "ok"},
|
|
||||||
{"index": 1, "status": "impossible", "evidence": "blocked"},
|
|
||||||
],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "all done or blocked",
|
|
||||||
},
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
d = mgr.evaluate_after_turn("response")
|
|
||||||
|
|
||||||
assert d["verdict"] == "done"
|
|
||||||
assert d["should_continue"] is False
|
|
||||||
assert mgr.state.status == "done"
|
|
||||||
|
|
||||||
def test_stickiness_judge_cannot_regress_completed(self, hermes_home):
|
|
||||||
"""Once an item is completed, judge updates trying to flip it back are ignored."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import ITEM_COMPLETED
|
|
||||||
|
|
||||||
mgr = self._make_decomposed_mgr("phase-b-stick", ["a"])
|
|
||||||
# First turn completes item 0.
|
|
||||||
with patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=(
|
|
||||||
{
|
|
||||||
"updates": [{"index": 0, "status": "completed", "evidence": "yes"}],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "done",
|
|
||||||
},
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
mgr.evaluate_after_turn("turn 1")
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
||||||
# Second turn: judge tries to send a non-terminal update.
|
|
||||||
# _parse_evaluate_response already filters non-terminal, but at the
|
|
||||||
# apply layer we also skip terminal items entirely. Smoke both.
|
|
||||||
with patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=(
|
|
||||||
{
|
|
||||||
"updates": [{"index": 0, "status": "impossible", "evidence": "regress"}],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "trying to regress",
|
|
||||||
},
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
mgr.evaluate_after_turn("turn 2")
|
|
||||||
# Sticky: status stays completed, evidence unchanged.
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
||||||
assert mgr.state.checklist[0].evidence == "yes"
|
|
||||||
|
|
||||||
def test_judge_appends_new_items(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
|
|
||||||
mgr = self._make_decomposed_mgr("phase-b-new", ["a"])
|
|
||||||
with patch.object(
|
|
||||||
goals, "evaluate_checklist",
|
|
||||||
return_value=(
|
|
||||||
{
|
|
||||||
"updates": [],
|
|
||||||
"new_items": [{"text": "newly discovered"}, {"text": "also this"}],
|
|
||||||
"reason": "found more work",
|
|
||||||
},
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
mgr.evaluate_after_turn("response")
|
|
||||||
assert len(mgr.state.checklist) == 3
|
|
||||||
assert mgr.state.checklist[1].text == "newly discovered"
|
|
||||||
assert mgr.state.checklist[1].added_by == "judge"
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# /subgoal user controls
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestSubgoalUserControls:
|
|
||||||
def test_add_subgoal_appends_user_item(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER
|
|
||||||
|
|
||||||
mgr = GoalManager(session_id="user-sid-1")
|
|
||||||
mgr.set("g")
|
|
||||||
item = mgr.add_subgoal("user added")
|
|
||||||
assert item.text == "user added"
|
|
||||||
assert item.status == ITEM_PENDING
|
|
||||||
assert item.added_by == ADDED_BY_USER
|
|
||||||
assert len(mgr.state.checklist) == 1
|
|
||||||
|
|
||||||
def test_add_subgoal_requires_active_goal(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
mgr = GoalManager(session_id="user-sid-2")
|
|
||||||
with pytest.raises(RuntimeError):
|
|
||||||
mgr.add_subgoal("x")
|
|
||||||
|
|
||||||
def test_add_subgoal_rejects_empty_text(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
mgr = GoalManager(session_id="user-sid-3")
|
|
||||||
mgr.set("g")
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
mgr.add_subgoal(" ")
|
|
||||||
|
|
||||||
def test_mark_subgoal_uses_1_based_index(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE
|
|
||||||
mgr = GoalManager(session_id="user-sid-4")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.add_subgoal("a")
|
|
||||||
mgr.add_subgoal("b")
|
|
||||||
mgr.add_subgoal("c")
|
|
||||||
mgr.mark_subgoal(2, "completed")
|
|
||||||
mgr.mark_subgoal(3, "impossible")
|
|
||||||
assert mgr.state.checklist[0].status == "pending"
|
|
||||||
assert mgr.state.checklist[1].status == ITEM_COMPLETED
|
|
||||||
assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE
|
|
||||||
|
|
||||||
def test_mark_subgoal_rejects_invalid_index(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
mgr = GoalManager(session_id="user-sid-5")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.add_subgoal("a")
|
|
||||||
with pytest.raises(IndexError):
|
|
||||||
mgr.mark_subgoal(5, "completed")
|
|
||||||
with pytest.raises(IndexError):
|
|
||||||
mgr.mark_subgoal(0, "completed")
|
|
||||||
|
|
||||||
def test_user_can_revert_terminal_item(self, hermes_home):
|
|
||||||
"""User mark_subgoal bypasses stickiness — only path to revert."""
|
|
||||||
from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING
|
|
||||||
mgr = GoalManager(session_id="user-sid-6")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.add_subgoal("a")
|
|
||||||
mgr.mark_subgoal(1, "completed")
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
||||||
mgr.mark_subgoal(1, "pending")
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_PENDING
|
|
||||||
|
|
||||||
def test_remove_subgoal(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
mgr = GoalManager(session_id="user-sid-7")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.add_subgoal("a")
|
|
||||||
mgr.add_subgoal("b")
|
|
||||||
mgr.add_subgoal("c")
|
|
||||||
removed = mgr.remove_subgoal(2)
|
|
||||||
assert removed.text == "b"
|
|
||||||
assert [it.text for it in mgr.state.checklist] == ["a", "c"]
|
|
||||||
|
|
||||||
def test_clear_checklist_resets_decomposed(self, hermes_home):
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
mgr = GoalManager(session_id="user-sid-8")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.state.decomposed = True
|
|
||||||
mgr.add_subgoal("a")
|
|
||||||
mgr.clear_checklist()
|
|
||||||
assert mgr.state.checklist == []
|
|
||||||
assert mgr.state.decomposed is False
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Conversation dump
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestConversationDump:
|
|
||||||
def test_dump_writes_messages_to_goals_dir(self, hermes_home):
|
|
||||||
from hermes_cli.goals import dump_conversation, conversation_dump_path
|
|
||||||
|
|
||||||
msgs = [
|
|
||||||
{"role": "user", "content": "hi"},
|
|
||||||
{"role": "assistant", "content": "hello"},
|
|
||||||
]
|
|
||||||
path = dump_conversation("dump-sid-1", msgs)
|
|
||||||
assert path is not None
|
|
||||||
assert path.exists()
|
|
||||||
# Path is under <HERMES_HOME>/goals/<sid>.json
|
|
||||||
assert path.parent.name == "goals"
|
|
||||||
assert path.name == "dump-sid-1.json"
|
|
||||||
|
|
||||||
loaded = json.loads(path.read_text())
|
|
||||||
assert loaded == msgs
|
|
||||||
|
|
||||||
# conversation_dump_path returns the same path
|
|
||||||
assert conversation_dump_path("dump-sid-1") == path
|
|
||||||
|
|
||||||
def test_dump_handles_unsafe_session_id(self, hermes_home):
|
|
||||||
from hermes_cli.goals import dump_conversation
|
|
||||||
|
|
||||||
path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}])
|
|
||||||
assert path is not None
|
|
||||||
# No traversal — slashes are normalized to underscores. (Periods are
|
|
||||||
# preserved because they're legitimate in filenames; the resulting
|
|
||||||
# name still cannot escape <HERMES_HOME>/goals/ since path
|
|
||||||
# separators are gone.)
|
|
||||||
assert "/" not in path.name
|
|
||||||
assert path.parent.name == "goals"
|
|
||||||
# Verify the resolved path stays under the goals dir.
|
|
||||||
from hermes_cli.goals import _goals_dump_dir
|
|
||||||
goals_dir = _goals_dump_dir().resolve()
|
|
||||||
assert str(path.resolve()).startswith(str(goals_dir))
|
|
||||||
|
|
||||||
def test_dump_skips_when_messages_empty(self, hermes_home):
|
|
||||||
from hermes_cli.goals import dump_conversation
|
|
||||||
assert dump_conversation("sid", []) is None
|
|
||||||
assert dump_conversation("", [{"role": "user", "content": "x"}]) is None
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Judge read_file tool: path restriction
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestJudgeReadFile:
|
|
||||||
def test_restricted_to_allowed_path(self, hermes_home, tmp_path):
|
|
||||||
from hermes_cli.goals import _judge_read_file
|
|
||||||
|
|
||||||
allowed = tmp_path / "allowed.json"
|
|
||||||
allowed.write_text("hello\nworld\n")
|
|
||||||
|
|
||||||
ok = _judge_read_file(str(allowed), allowed_path=allowed)
|
|
||||||
loaded = json.loads(ok)
|
|
||||||
assert loaded["content"].startswith("hello")
|
|
||||||
|
|
||||||
# Try to read a different file.
|
|
||||||
sneaky = tmp_path / "secret.txt"
|
|
||||||
sneaky.write_text("nope\n")
|
|
||||||
denied = _judge_read_file(str(sneaky), allowed_path=allowed)
|
|
||||||
loaded = json.loads(denied)
|
|
||||||
assert "error" in loaded
|
|
||||||
assert "restricted" in loaded["error"]
|
|
||||||
|
|
||||||
def test_pagination(self, hermes_home, tmp_path):
|
|
||||||
from hermes_cli.goals import _judge_read_file
|
|
||||||
f = tmp_path / "big.json"
|
|
||||||
f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n")
|
|
||||||
|
|
||||||
# offset=10, limit=5 should return lines 10..14.
|
|
||||||
result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f))
|
|
||||||
assert result["returned"] == 5
|
|
||||||
assert "line-9" in result["content"] # 1-based: line 10 == zero-indexed 9
|
|
||||||
assert result["next_offset"] == 15
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Index conversion: judge emits 1-based, apply layer uses 0-based
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestJudgeIndexConversion:
|
|
||||||
def test_parse_evaluate_converts_1based_to_0based(self):
|
|
||||||
"""The judge sees the checklist with 1-based indices (rendered as
|
|
||||||
'1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices.
|
|
||||||
``_parse_evaluate_response`` must convert them to 0-based so the
|
|
||||||
apply layer can index ``state.checklist`` directly.
|
|
||||||
"""
|
|
||||||
from hermes_cli.goals import _parse_evaluate_response
|
|
||||||
|
|
||||||
raw = '''
|
|
||||||
{"updates": [
|
|
||||||
{"index": 1, "status": "completed", "evidence": "first item"},
|
|
||||||
{"index": 3, "status": "impossible", "evidence": "third item"}
|
|
||||||
],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "evaluated"}
|
|
||||||
'''
|
|
||||||
parsed, parse_failed = _parse_evaluate_response(raw)
|
|
||||||
assert parse_failed is False
|
|
||||||
# 1 → 0, 3 → 2
|
|
||||||
assert [u["index"] for u in parsed["updates"]] == [0, 2]
|
|
||||||
assert parsed["updates"][0]["evidence"] == "first item"
|
|
||||||
assert parsed["updates"][1]["status"] == "impossible"
|
|
||||||
|
|
||||||
def test_full_round_trip_judge_index_to_state(self, hermes_home):
|
|
||||||
"""End-to-end: judge emits 1-based, parser converts, apply layer
|
|
||||||
flips the right items in state.checklist."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import (
|
|
||||||
GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED,
|
|
||||||
ADDED_BY_JUDGE,
|
|
||||||
)
|
|
||||||
|
|
||||||
mgr = GoalManager(session_id="idx-round-trip")
|
|
||||||
mgr.set("g")
|
|
||||||
mgr.state.decomposed = True
|
|
||||||
mgr.state.checklist = [
|
|
||||||
ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
||||||
ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
||||||
ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
||||||
]
|
|
||||||
goals.save_goal("idx-round-trip", mgr.state)
|
|
||||||
|
|
||||||
# Simulate the judge returning a raw-JSON Phase-B reply via the
|
|
||||||
# auxiliary client: the parser handles the 1-based → 0-based
|
|
||||||
# conversion so the apply layer flips item 1 (text="first").
|
|
||||||
class FakeMessage:
|
|
||||||
content = '''
|
|
||||||
{"updates": [{"index": 1, "status": "completed", "evidence": "first done"}],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "..."}
|
|
||||||
'''
|
|
||||||
tool_calls = None
|
|
||||||
|
|
||||||
class FakeChoice:
|
|
||||||
message = FakeMessage()
|
|
||||||
|
|
||||||
class FakeResponse:
|
|
||||||
choices = [FakeChoice()]
|
|
||||||
|
|
||||||
class FakeClient:
|
|
||||||
class chat:
|
|
||||||
class completions:
|
|
||||||
@staticmethod
|
|
||||||
def create(**kwargs):
|
|
||||||
return FakeResponse()
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")):
|
|
||||||
mgr.evaluate_after_turn("ran the script and item 1 is done")
|
|
||||||
|
|
||||||
# Item 1 (text="first") should now be completed.
|
|
||||||
assert mgr.state.checklist[0].text == "first"
|
|
||||||
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
||||||
assert mgr.state.checklist[0].evidence == "first done"
|
|
||||||
# Other items still pending.
|
|
||||||
assert mgr.state.checklist[1].status == ITEM_PENDING
|
|
||||||
assert mgr.state.checklist[2].status == ITEM_PENDING
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Compression session-rotation: goal must follow the new session_id
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestGoalSurvivesCompressionRotation:
|
|
||||||
def test_load_goal_after_session_id_rotates(self, hermes_home):
|
|
||||||
"""When auto-compression rotates the session_id, the goal must be
|
|
||||||
readable from the new session_id (forwarded by run_agent's
|
|
||||||
_compress_context block).
|
|
||||||
|
|
||||||
We don't run the full _compress_context method here — it has
|
|
||||||
~60 dependencies. Instead we mirror exactly what that block does
|
|
||||||
with state_meta and assert the goal manager picks it up.
|
|
||||||
"""
|
|
||||||
from hermes_cli.goals import GoalManager
|
|
||||||
from hermes_state import SessionDB
|
|
||||||
|
|
||||||
# Create a goal under a parent session_id.
|
|
||||||
parent_sid = "parent-rotate-001"
|
|
||||||
mgr = GoalManager(session_id=parent_sid)
|
|
||||||
mgr.set("survive compression")
|
|
||||||
assert mgr.is_active()
|
|
||||||
|
|
||||||
# Simulate the run_agent._compress_context forwarding block:
|
|
||||||
# read goal:<old>, write goal:<new> on the same SessionDB instance.
|
|
||||||
db = SessionDB()
|
|
||||||
new_sid = "child-rotate-001"
|
|
||||||
blob = db.get_meta(f"goal:{parent_sid}")
|
|
||||||
assert blob, "goal must be in state_meta"
|
|
||||||
db.set_meta(f"goal:{new_sid}", blob)
|
|
||||||
|
|
||||||
# New GoalManager for the rotated session_id should load the same goal.
|
|
||||||
mgr2 = GoalManager(session_id=new_sid)
|
|
||||||
assert mgr2.is_active()
|
|
||||||
assert mgr2.state.goal == "survive compression"
|
|
||||||
# Counters/checklist preserved verbatim.
|
|
||||||
assert mgr2.state.turns_used == mgr.state.turns_used
|
|
||||||
assert mgr2.state.checklist == mgr.state.checklist
|
|
||||||
|
|
||||||
def test_no_forward_when_no_goal(self, hermes_home):
|
|
||||||
"""Forwarding is a no-op when the parent session has no goal."""
|
|
||||||
from hermes_state import SessionDB
|
|
||||||
from hermes_cli.goals import load_goal
|
|
||||||
|
|
||||||
db = SessionDB()
|
|
||||||
# Parent has no goal at all.
|
|
||||||
assert db.get_meta("goal:parent-no-goal") is None
|
|
||||||
blob = db.get_meta("goal:parent-no-goal")
|
|
||||||
if blob: # parity with production guard
|
|
||||||
db.set_meta("goal:child-no-goal", blob)
|
|
||||||
|
|
||||||
# Child should still have no goal.
|
|
||||||
assert load_goal("child-no-goal") is None
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class _FakeFn:
|
|
||||||
def __init__(self, name, args):
|
|
||||||
self.name = name
|
|
||||||
self.arguments = args if isinstance(args, str) else json.dumps(args)
|
|
||||||
|
|
||||||
|
|
||||||
class _FakeToolCall:
|
|
||||||
def __init__(self, tc_id, name, args):
|
|
||||||
self.id = tc_id
|
|
||||||
self.type = "function"
|
|
||||||
self.function = _FakeFn(name, args)
|
|
||||||
|
|
||||||
|
|
||||||
class _FakeMessage:
|
|
||||||
def __init__(self, *, content="", tool_calls=None):
|
|
||||||
self.content = content
|
|
||||||
self.tool_calls = tool_calls or []
|
|
||||||
|
|
||||||
|
|
||||||
class _FakeChoice:
|
|
||||||
def __init__(self, message):
|
|
||||||
self.message = message
|
|
||||||
|
|
||||||
|
|
||||||
class _FakeResponse:
|
|
||||||
def __init__(self, message):
|
|
||||||
self.choices = [_FakeChoice(message)]
|
|
||||||
|
|
||||||
|
|
||||||
def _make_fake_client(scripted_messages):
|
|
||||||
"""Return a fake client whose .chat.completions.create() returns the
|
|
||||||
next scripted message each call. Mutates the underlying list as a
|
|
||||||
queue so repeat calls advance.
|
|
||||||
"""
|
|
||||||
class FakeClient:
|
|
||||||
class chat:
|
|
||||||
class completions:
|
|
||||||
_queue = list(scripted_messages)
|
|
||||||
_calls = []
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def create(cls, **kwargs):
|
|
||||||
cls._calls.append(kwargs)
|
|
||||||
if not cls._queue:
|
|
||||||
raise RuntimeError("scripted-message queue exhausted")
|
|
||||||
return _FakeResponse(cls._queue.pop(0))
|
|
||||||
|
|
||||||
return FakeClient
|
|
||||||
|
|
||||||
|
|
||||||
class TestPhaseAToolCall:
|
|
||||||
def test_decompose_via_submit_checklist_tool(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import decompose_goal
|
|
||||||
|
|
||||||
msg = _FakeMessage(
|
|
||||||
tool_calls=[_FakeToolCall(
|
|
||||||
"tc-1", "submit_checklist",
|
|
||||||
{"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
|
|
||||||
)],
|
|
||||||
)
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
items, err = decompose_goal("build a website")
|
|
||||||
|
|
||||||
assert err is None
|
|
||||||
assert [it["text"] for it in items] == ["first criterion", "second criterion"]
|
|
||||||
# Verify we forced the tool: tool_choice should target submit_checklist.
|
|
||||||
call = client.chat.completions._calls[0]
|
|
||||||
assert "tools" in call
|
|
||||||
assert call["tools"][0]["function"]["name"] == "submit_checklist"
|
|
||||||
# tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
|
|
||||||
# or "required" / "auto" if a fallback was used; primary attempt forces it.
|
|
||||||
tc = call["tool_choice"]
|
|
||||||
assert (
|
|
||||||
(isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
|
|
||||||
or tc == "required"
|
|
||||||
or tc == "auto"
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
|
|
||||||
"""If a broken provider returns content instead of a tool call, the
|
|
||||||
backstop JSON parser still salvages a checklist."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import decompose_goal
|
|
||||||
|
|
||||||
msg = _FakeMessage(
|
|
||||||
content='{"checklist": [{"text": "salvaged"}]}',
|
|
||||||
tool_calls=[],
|
|
||||||
)
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
items, err = decompose_goal("g")
|
|
||||||
|
|
||||||
assert err is None
|
|
||||||
assert items == [{"text": "salvaged"}]
|
|
||||||
|
|
||||||
def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import decompose_goal
|
|
||||||
|
|
||||||
msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
items, err = decompose_goal("g")
|
|
||||||
|
|
||||||
assert items == []
|
|
||||||
assert err and "submit_checklist" in err
|
|
||||||
|
|
||||||
def test_decompose_drops_empty_text_items(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import decompose_goal
|
|
||||||
|
|
||||||
msg = _FakeMessage(
|
|
||||||
tool_calls=[_FakeToolCall(
|
|
||||||
"tc-1", "submit_checklist",
|
|
||||||
{"items": [{"text": "ok"}, {"text": ""}, {"text": " "}, {"text": "two"}]},
|
|
||||||
)],
|
|
||||||
)
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
items, err = decompose_goal("g")
|
|
||||||
|
|
||||||
assert err is None
|
|
||||||
assert [it["text"] for it in items] == ["ok", "two"]
|
|
||||||
|
|
||||||
|
|
||||||
class TestPhaseBToolCall:
|
|
||||||
def test_evaluate_via_update_checklist_tool(self, hermes_home):
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
|
||||||
|
|
||||||
state = GoalState(
|
|
||||||
goal="g",
|
|
||||||
decomposed=True,
|
|
||||||
checklist=[
|
|
||||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
|
||||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
msg = _FakeMessage(
|
|
||||||
tool_calls=[_FakeToolCall(
|
|
||||||
"tc-1", "update_checklist",
|
|
||||||
{
|
|
||||||
# 1-based indices; layer converts to 0-based.
|
|
||||||
"updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
|
|
||||||
"new_items": [{"text": "discovered c"}],
|
|
||||||
"reason": "ran a",
|
|
||||||
},
|
|
||||||
)],
|
|
||||||
)
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
parsed, parse_failed = evaluate_checklist(
|
|
||||||
state, "did the first thing", history_path=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert parse_failed is False
|
|
||||||
# Index converted 1 → 0
|
|
||||||
assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
|
|
||||||
assert parsed["new_items"] == [{"text": "discovered c"}]
|
|
||||||
assert parsed["reason"] == "ran a"
|
|
||||||
|
|
||||||
def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
|
|
||||||
"""Phase-B tool loop: judge calls read_file once, then update_checklist."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
|
||||||
|
|
||||||
# Make a real history file so the path-restriction check passes.
|
|
||||||
hist = tmp_path / "hist.json"
|
|
||||||
hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
|
|
||||||
|
|
||||||
state = GoalState(
|
|
||||||
goal="g",
|
|
||||||
decomposed=True,
|
|
||||||
checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
|
|
||||||
)
|
|
||||||
|
|
||||||
msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
|
|
||||||
"tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
|
|
||||||
)])
|
|
||||||
msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
|
|
||||||
"tc-2", "update_checklist",
|
|
||||||
{
|
|
||||||
"updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "verified via read_file",
|
|
||||||
},
|
|
||||||
)])
|
|
||||||
client = _make_fake_client([msg1, msg2])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
parsed, parse_failed = evaluate_checklist(
|
|
||||||
state, "did the thing", history_path=hist,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert parse_failed is False
|
|
||||||
assert parsed["updates"][0]["status"] == "completed"
|
|
||||||
assert parsed["reason"] == "verified via read_file"
|
|
||||||
# Two API calls — one for the read, one for the verdict.
|
|
||||||
assert len(client.chat.completions._calls) == 2
|
|
||||||
|
|
||||||
def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
|
|
||||||
"""update_checklist should only accept 'completed' or 'impossible' —
|
|
||||||
any 'pending' updates are dropped at the normalize layer."""
|
|
||||||
from hermes_cli import goals
|
|
||||||
from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
|
|
||||||
|
|
||||||
state = GoalState(
|
|
||||||
goal="g",
|
|
||||||
decomposed=True,
|
|
||||||
checklist=[
|
|
||||||
ChecklistItem(text="a", status=ITEM_PENDING),
|
|
||||||
ChecklistItem(text="b", status=ITEM_PENDING),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
msg = _FakeMessage(tool_calls=[_FakeToolCall(
|
|
||||||
"tc-1", "update_checklist",
|
|
||||||
{
|
|
||||||
"updates": [
|
|
||||||
{"index": 1, "status": "completed", "evidence": "yes"},
|
|
||||||
{"index": 2, "status": "pending", "evidence": "skip me"},
|
|
||||||
],
|
|
||||||
"new_items": [],
|
|
||||||
"reason": "...",
|
|
||||||
},
|
|
||||||
)])
|
|
||||||
client = _make_fake_client([msg])
|
|
||||||
|
|
||||||
with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
|
|
||||||
parsed, _pf = evaluate_checklist(state, "x", history_path=None)
|
|
||||||
|
|
||||||
# Only the completed flip survives; pending update is dropped silently.
|
|
||||||
assert len(parsed["updates"]) == 1
|
|
||||||
assert parsed["updates"][0]["index"] == 0
|
|
||||||
|
|
|
||||||
|
|
@ -3251,7 +3251,6 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None:
|
||||||
decision = goal_mgr.evaluate_after_turn(
|
decision = goal_mgr.evaluate_after_turn(
|
||||||
raw,
|
raw,
|
||||||
user_initiated=True,
|
user_initiated=True,
|
||||||
messages=list(session.get("history") or []),
|
|
||||||
)
|
)
|
||||||
verdict_msg = decision.get("message") or ""
|
verdict_msg = decision.get("message") or ""
|
||||||
if verdict_msg:
|
if verdict_msg:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue