From 3e7145e0bbcded852a5324ceb549fe5ca94ac924 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 11 May 2026 07:06:27 -0700 Subject: [PATCH] revert: roll back /goal checklist + /subgoal feature stack (#23813) * Revert "fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)" This reverts commit a63a2b7c78562cd4eaf33f5f7db81ae0b3938552. * Revert "fix(goals): forward standing /goal state on auto-compression session rotation (#23530)" This reverts commit 4a080b1d5aa7528a679880c93147bc7fffdd267a. * Revert "feat(goals): /goal checklist + /subgoal user controls (#23456)" This reverts commit 404640a2b752f502825dc8b26212204fa890d495. --- cli.py | 105 +- gateway/run.py | 101 +- hermes_cli/commands.py | 2 - hermes_cli/goals.py | 1442 ++--------------------- run_agent.py | 19 - tests/cli/test_cli_goal_interrupt.py | 17 +- tests/gateway/test_goal_verdict_send.py | 14 +- tests/hermes_cli/test_goals.py | 899 +------------- tui_gateway/server.py | 1 - 9 files changed, 113 insertions(+), 2487 deletions(-) diff --git a/cli.py b/cli.py index fd9cc275e8a..bb1bdb61764 100644 --- a/cli.py +++ b/cli.py @@ -7252,8 +7252,6 @@ class HermesCLI: _cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}") elif canonical == "goal": self._handle_goal_command(cmd_original) - elif canonical == "subgoal": - self._handle_subgoal_command(cmd_original) elif canonical == "skin": self._handle_skin_command(cmd_original) elif canonical == "voice": @@ -7850,103 +7848,6 @@ class HermesCLI: except Exception: pass - def _handle_subgoal_command(self, cmd: str) -> None: - """Dispatch /subgoal subcommands. - - Forms: - /subgoal show the checklist - /subgoal append a user item - /subgoal complete mark item n completed - /subgoal impossible mark item n impossible - /subgoal undo revert item n to pending - /subgoal remove delete item n - /subgoal clear wipe the checklist (judge re-decomposes) - """ - parts = (cmd or "").strip().split(None, 2) - # parts[0] == "/subgoal"; remainder is what the user typed - arg = " ".join(parts[1:]).strip() if len(parts) > 1 else "" - - mgr = self._get_goal_manager() - if mgr is None: - _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}") - return - - if not mgr.has_goal(): - _cprint(f" {_DIM}No active goal. Set one with /goal .{_RST}") - return - - # No args → show the checklist. - if not arg: - _cprint(f" {mgr.status_line()}") - _cprint(f" {mgr.render_checklist()}") - return - - tokens = arg.split(None, 1) - verb = tokens[0].lower() - rest = tokens[1].strip() if len(tokens) > 1 else "" - - # Action verbs operate on indices. - action_status_map = { - "complete": "completed", - "completed": "completed", - "done": "completed", - "impossible": "impossible", - "imp": "impossible", - "skip": "impossible", - "undo": "pending", - "pending": "pending", - "reset": "pending", - } - if verb in action_status_map: - if not rest: - _cprint(f" Usage: /subgoal {verb} ") - return - try: - idx = int(rest.split()[0]) - except ValueError: - _cprint(f" /subgoal {verb}: must be an integer (1-based index).") - return - try: - item = mgr.mark_subgoal(idx, action_status_map[verb]) - except (IndexError, ValueError, RuntimeError) as exc: - _cprint(f" /subgoal {verb}: {exc}") - return - _cprint(f" ✓ Item {idx} → {item.status}: {item.text}") - return - - if verb == "remove": - if not rest: - _cprint(" Usage: /subgoal remove ") - return - try: - idx = int(rest.split()[0]) - except ValueError: - _cprint(" /subgoal remove: must be an integer (1-based index).") - return - try: - removed = mgr.remove_subgoal(idx) - except (IndexError, RuntimeError) as exc: - _cprint(f" /subgoal remove: {exc}") - return - _cprint(f" ✓ Removed item {idx}: {removed.text}") - return - - if verb == "clear": - mgr.clear_checklist() - _cprint( - " ✓ Checklist cleared. The judge will re-decompose on the next turn." - ) - return - - # Otherwise: append `arg` as a user-authored checklist item. - try: - item = mgr.add_subgoal(arg) - except (ValueError, RuntimeError) as exc: - _cprint(f" /subgoal: {exc}") - return - idx = len(mgr.state.checklist) if mgr.state else 0 - _cprint(f" ✓ Added subgoal {idx}: {item.text}") - def _maybe_continue_goal_after_turn(self) -> None: """Hook run after every CLI turn. Judges + maybe re-queues. @@ -8024,11 +7925,7 @@ class HermesCLI: if not last_response.strip(): return - decision = mgr.evaluate_after_turn( - last_response, - user_initiated=True, - messages=getattr(self, "conversation_history", None) or [], - ) + decision = mgr.evaluate_after_turn(last_response, user_initiated=True) msg = decision.get("message") or "" if msg: _cprint(f" {msg}") diff --git a/gateway/run.py b/gateway/run.py index 7b2d7dea333..6c652e7cf10 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -6083,12 +6083,6 @@ class GatewayRunner: return await self._handle_goal_command(event) return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal." - # /subgoal is safe mid-run — it only modifies the active goal's - # checklist, which the judge consults at turn boundaries. There - # is no race with the running turn. - if _cmd_def_inner and _cmd_def_inner.name == "subgoal": - return await self._handle_subgoal_command(event) - # Session-level toggles that are safe to run mid-agent — # /yolo can unblock a pending approval prompt, /verbose cycles # the tool-progress display mode for the ongoing stream. @@ -6467,9 +6461,6 @@ class GatewayRunner: if canonical == "goal": return await self._handle_goal_command(event) - if canonical == "subgoal": - return await self._handle_subgoal_command(event) - if canonical == "voice": return await self._handle_voice_command(event) @@ -6654,18 +6645,10 @@ class GatewayRunner: except Exception: session_entry = None if session_entry is not None: - # Pull the agent's full messages list from the result - # so the judge can dump it for its read_file tool. - _agent_messages: list = [] - if isinstance(_agent_result, dict): - _msgs = _agent_result.get("messages") - if isinstance(_msgs, list): - _agent_messages = _msgs await self._post_turn_goal_continuation( session_entry=session_entry, source=source, final_response=_final_text, - agent_messages=_agent_messages, ) except Exception as _goal_exc: logger.debug("goal continuation hook failed: %s", _goal_exc) @@ -9402,83 +9385,6 @@ class GatewayRunner: return t("gateway.goal.set", budget=state.max_turns, goal=state.goal) - async def _handle_subgoal_command(self, event: "MessageEvent") -> str: - """Handle /subgoal for gateway platforms. - - Forms (mirror of CLI): - /subgoal show the checklist - /subgoal append a user item - /subgoal complete | done mark item n completed - /subgoal impossible mark item n impossible - /subgoal undo revert item n to pending - /subgoal remove delete item n - /subgoal clear wipe the checklist - """ - args = (event.get_command_args() or "").strip() - - mgr, _session_entry = self._get_goal_manager_for_event(event) - if mgr is None: - return t("gateway.goal.unavailable") - - if not mgr.has_goal(): - return "No active goal. Set one with /goal ." - - if not args: - return f"{mgr.status_line()}\n{mgr.render_checklist()}" - - tokens = args.split(None, 1) - verb = tokens[0].lower() - rest = tokens[1].strip() if len(tokens) > 1 else "" - - action_status_map = { - "complete": "completed", - "completed": "completed", - "done": "completed", - "impossible": "impossible", - "imp": "impossible", - "skip": "impossible", - "undo": "pending", - "pending": "pending", - "reset": "pending", - } - if verb in action_status_map: - if not rest: - return f"Usage: /subgoal {verb} " - try: - idx = int(rest.split()[0]) - except ValueError: - return f"/subgoal {verb}: must be an integer (1-based index)." - try: - item = mgr.mark_subgoal(idx, action_status_map[verb]) - except (IndexError, ValueError, RuntimeError) as exc: - return f"/subgoal {verb}: {exc}" - return f"✓ Item {idx} → {item.status}: {item.text}" - - if verb == "remove": - if not rest: - return "Usage: /subgoal remove " - try: - idx = int(rest.split()[0]) - except ValueError: - return "/subgoal remove: must be an integer (1-based index)." - try: - removed = mgr.remove_subgoal(idx) - except (IndexError, RuntimeError) as exc: - return f"/subgoal remove: {exc}" - return f"✓ Removed item {idx}: {removed.text}" - - if verb == "clear": - mgr.clear_checklist() - return "✓ Checklist cleared. The judge will re-decompose on the next turn." - - # Otherwise — append `args` as a new user-authored checklist item. - try: - item = mgr.add_subgoal(args) - except (ValueError, RuntimeError) as exc: - return f"/subgoal: {exc}" - idx = len(mgr.state.checklist) if mgr.state else 0 - return f"✓ Added subgoal {idx}: {item.text}" - async def _send_goal_status_notice(self, source: Any, message: str) -> None: """Send a /goal judge status line back to the originating chat/thread.""" adapter = self.adapters.get(source.platform) @@ -9547,7 +9453,6 @@ class GatewayRunner: session_entry: Any, source: Any, final_response: str, - agent_messages: Optional[list] = None, ) -> None: """Run the goal judge after a gateway turn and, if still active, enqueue a continuation prompt for the same session. @@ -9575,11 +9480,7 @@ class GatewayRunner: if not mgr.is_active(): return - decision = mgr.evaluate_after_turn( - final_response or "", - user_initiated=True, - messages=agent_messages or [], - ) + decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True) msg = decision.get("message") or "" # Defer the status line until after the adapter has delivered the diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index e2727c25bab..3f543bba647 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -104,8 +104,6 @@ COMMAND_REGISTRY: list[CommandDef] = [ args_hint=""), CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session", args_hint="[text | pause | resume | clear | status]"), - CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session", - args_hint="[text | complete N | impossible N | undo N | remove N | clear]"), CommandDef("status", "Show session info", "Session"), CommandDef("whoami", "Show your slash command access (admin / user)", "Info"), CommandDef("profile", "Show active profile name and home directory", "Info"), diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index 3bd869296a8..894cdddb01b 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -7,18 +7,6 @@ continuation prompt back into the same session and keeps working until the goal is done, turn budget is exhausted, the user pauses/clears it, or the user sends a new message (which takes priority and pauses the goal loop). -Checklist mode (added 2026-05): when a goal is set, a Phase-A "decompose" -call asks the judge to write an extremely detailed checklist of concrete -completion criteria for that goal. On every subsequent turn (Phase B) the -judge evaluates the agent's most recent output against EACH pending item -and may flip pending → completed | impossible, or append new items it -discovers along the way. The goal is done only when every checklist item -is in a terminal status. This is much harsher than the freeform -"is the goal done?" prompt and gives users a visible, verifiable progress -surface via /subgoal. A bounded read_file tool loop lets the judge inspect -the dumped conversation history when the snippet alone isn't enough to -rule. - State is persisted in SessionDB's ``state_meta`` table keyed by ``goal:`` so ``/resume`` picks it up. @@ -33,9 +21,6 @@ Design notes / invariants: prompt and also pauses the goal loop for that turn (we still re-judge after, so if the user's message happens to complete the goal the judge will say ``done``). -- Stickiness: once an item is marked completed or impossible, only the user - (via /subgoal undo) can flip it back. Judge updates that try to regress - terminal items are silently ignored. - This module has zero hard dependency on ``cli.HermesCLI`` or the gateway runner — both wire the same ``GoalManager`` in. @@ -46,12 +31,10 @@ from __future__ import annotations import json import logging -import os import re import time -from dataclasses import dataclass, field, asdict -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from dataclasses import dataclass, asdict +from typing import Any, Dict, Optional, Tuple logger = logging.getLogger(__name__) @@ -61,9 +44,8 @@ logger = logging.getLogger(__name__) # ────────────────────────────────────────────────────────────────────── DEFAULT_MAX_TURNS = 20 -DEFAULT_JUDGE_TIMEOUT = 60.0 -# Cap how much of the last response we send to the judge inline. The judge -# can read the dumped conversation file via read_file if it needs more. +DEFAULT_JUDGE_TIMEOUT = 30.0 +# Cap how much of the last response + recent messages we send to the judge. _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # After this many consecutive judge *parse* failures (empty output / non-JSON), # the loop auto-pauses and points the user at the goal_judge config. API / @@ -73,36 +55,8 @@ _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # exhausted with every reply shaped like `judge returned empty response` or # `judge reply was not JSON`. DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3 -# Bound the Phase-B judge tool loop: if the judge keeps calling read_file -# without ever emitting a verdict, cap it so we don't burn the model's budget. -DEFAULT_MAX_JUDGE_TOOL_CALLS = 5 -# Cap a single read_file response so a judge that tries to read 100k lines -# doesn't blow up its own context. Judge can paginate if needed. -_JUDGE_READ_FILE_MAX_LINES = 400 -_JUDGE_READ_FILE_MAX_CHARS = 32_000 -# Status constants ──────────────────────────────────────────────────── -ITEM_PENDING = "pending" -ITEM_COMPLETED = "completed" -ITEM_IMPOSSIBLE = "impossible" -TERMINAL_ITEM_STATUSES = frozenset({ITEM_COMPLETED, ITEM_IMPOSSIBLE}) -VALID_ITEM_STATUSES = frozenset({ITEM_PENDING, ITEM_COMPLETED, ITEM_IMPOSSIBLE}) - -ITEM_MARKERS = { - ITEM_COMPLETED: "[x]", - ITEM_IMPOSSIBLE: "[!]", - ITEM_PENDING: "[ ]", -} - -ADDED_BY_JUDGE = "judge" -ADDED_BY_USER = "user" - - -# ────────────────────────────────────────────────────────────────────── -# Continuation prompt -# ────────────────────────────────────────────────────────────────────── - CONTINUATION_PROMPT_TEMPLATE = ( "[Continuing toward your standing goal]\n" "Goal: {goal}\n\n" @@ -111,57 +65,8 @@ CONTINUATION_PROMPT_TEMPLATE = ( "If you are blocked and need input from the user, say so clearly and stop." ) -CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE = ( - "[Continuing toward your standing goal]\n" - "Goal: {goal}\n\n" - "Checklist progress ({done}/{total} done):\n" - "{checklist}\n\n" - "Work on the unchecked items above. Do not declare items done yourself " - "— a judge marks them based on evidence in your output. If an item is " - "genuinely impossible in this environment, explain why so the judge can " - "mark it impossible. If you are blocked on a remaining item and need " - "user input, say so clearly and stop." -) - -# ────────────────────────────────────────────────────────────────────── -# Phase-A: decompose prompts -# ────────────────────────────────────────────────────────────────────── - -DECOMPOSE_SYSTEM_PROMPT = ( - "You are a strict judge for an autonomous agent. Your first job, before " - "judging anything, is to break the user's stated goal into an EXTREMELY " - "DETAILED checklist of concrete, verifiable completion criteria. Each " - "item must be specific enough that a third party reading the agent's " - "output could decide unambiguously whether that item was achieved.\n\n" - "Be exhaustive. Bias toward MORE items, not fewer. Include sub-items, " - "edge cases, quality bars, deployment steps, verification checks, and " - "anything the user would reasonably expect from a goal of this type. " - "If the user said 'build me a website' you should be enumerating " - "homepage exists, navigation links work, content is non-placeholder, " - "mobile responsive, accessibility tags present, deployed somewhere " - "publicly accessible, domain/URL is functional, etc. Better to " - "over-specify and let a few items get marked impossible than to " - "under-specify and let the agent declare victory early.\n\n" - "Submit your checklist by calling the ``submit_checklist`` tool. Do " - "not reply with prose or JSON in your message body — call the tool. " - "The system will not see anything you write outside the tool call." -) - -DECOMPOSE_USER_PROMPT_TEMPLATE = ( - "Goal:\n{goal}\n\n" - "Produce the harshest, most detailed checklist of completion criteria " - "you can. Aim for at least 5 items; more is better when warranted. " - "Each item should be a single verifiable statement of fact about the " - "finished work." -) - - -# ────────────────────────────────────────────────────────────────────── -# Phase-B: evaluate prompts -# ────────────────────────────────────────────────────────────────────── - -EVALUATE_SYSTEM_PROMPT_FREEFORM = ( +JUDGE_SYSTEM_PROMPT = ( "You are a strict judge evaluating whether an autonomous agent has " "achieved a user's stated goal. You receive the goal text and the " "agent's most recent response. Your only job is to decide whether " @@ -173,57 +78,11 @@ EVALUATE_SYSTEM_PROMPT_FREEFORM = ( "user input (treat this as DONE with reason describing the block).\n\n" "Otherwise the goal is NOT done — CONTINUE.\n\n" "Reply ONLY with a single JSON object on one line:\n" - '{"done": , "reason": ""}' + '{\"done\": , \"reason\": \"\"}' ) -EVALUATE_SYSTEM_PROMPT_CHECKLIST = ( - "You are a strict judge evaluating an autonomous agent's progress on " - "a user's goal that has a detailed checklist of completion criteria. " - "For EACH currently-pending checklist item, decide whether the " - "available evidence shows the item is satisfied.\n\n" - "Be strict but not absurd. Default to leaving items pending UNLESS " - "evidence is reasonably clear. Reasonable evidence includes:\n" - "- The agent's most recent response describing or showing the work\n" - "- Tool call results visible in the conversation history (file writes, " - "command output, web requests, etc.)\n" - "- A clear statement by the agent that the work was done, when " - "supported by tool output earlier in the conversation\n\n" - "Do NOT require the agent to re-prove items it has already established " - "in earlier turns. If a tool call earlier in the conversation already " - "wrote a file, you do not need fresh `ls` output every turn — once " - "established, it's done.\n\n" - "Flip pending → completed when the response or recent tool calls show " - "the item is satisfied. Flip pending → impossible only when the work " - "demonstrates the item cannot be achieved in this environment (NOT " - "merely that the agent didn't try). Vague intentions ('I will do X " - "next') do NOT count as completion.\n\n" - "STICKINESS: items already marked completed or impossible are frozen. " - "Do not include them in your updates. Only the user can revert them.\n\n" - "TOOLS:\n" - "- ``read_file(path, offset, limit)``: inspect the dumped conversation " - "history file whose path is given in the user message. Use this when " - "the snippet alone isn't enough to rule. Each call costs tokens, so " - "only read when needed.\n" - "- ``update_checklist(updates, new_items, reason)``: issue your " - "verdict. Call this exactly once per turn when you are ready to rule. " - "Calling it ENDS the evaluation.\n\n" - "You MUST call one of these tools every turn. Do not reply with " - "prose or JSON in your message body — the system will not see " - "anything written outside tool calls. When you cite evidence, " - "reference the agent's actual output specifically." -) -EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = ( - "Goal:\n{goal}\n\n" - "Current checklist (each item is numbered, 1-based — use these " - "exact 1-based numbers as the ``index`` field in your updates):\n{checklist_block}\n\n" - "Agent's most recent response (snippet):\n{response}\n\n" - "Conversation history file (call read_file on this path if you need " - "more context — pagination supported via offset/limit):\n{history_path}\n\n" - "Evaluate each pending item. Cite specific evidence." -) - -EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = ( +JUDGE_USER_PROMPT_TEMPLATE = ( "Goal:\n{goal}\n\n" "Agent's most recent response:\n{response}\n\n" "Is the goal satisfied?" @@ -231,55 +90,16 @@ EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = ( # ────────────────────────────────────────────────────────────────────── -# Dataclasses +# Dataclass # ────────────────────────────────────────────────────────────────────── -@dataclass -class ChecklistItem: - """One concrete completion criterion attached to a goal.""" - - text: str - status: str = ITEM_PENDING # pending | completed | impossible - added_by: str = ADDED_BY_JUDGE # judge | user - added_at: float = 0.0 - completed_at: Optional[float] = None - evidence: Optional[str] = None # judge's rationale on flip - - def to_dict(self) -> Dict[str, Any]: - return asdict(self) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "ChecklistItem": - text = str(data.get("text", "")).strip() - if not text: - text = "(empty item)" - status = str(data.get("status", ITEM_PENDING)).strip().lower() - if status not in VALID_ITEM_STATUSES: - status = ITEM_PENDING - added_by = str(data.get("added_by", ADDED_BY_JUDGE)).strip().lower() - if added_by not in (ADDED_BY_JUDGE, ADDED_BY_USER): - added_by = ADDED_BY_JUDGE - return cls( - text=text, - status=status, - added_by=added_by, - added_at=float(data.get("added_at", 0.0) or 0.0), - completed_at=( - float(data["completed_at"]) - if data.get("completed_at") is not None - else None - ), - evidence=data.get("evidence"), - ) - - @dataclass class GoalState: """Serializable goal state stored per session.""" goal: str - status: str = "active" # active | paused | done | cleared + status: str = "active" # active | paused | done | cleared turns_used: int = 0 max_turns: int = DEFAULT_MAX_TURNS created_at: float = 0.0 @@ -288,28 +108,13 @@ class GoalState: last_reason: Optional[str] = None paused_reason: Optional[str] = None # why we auto-paused (budget, etc.) consecutive_parse_failures: int = 0 # judge-output parse failures in a row - # Checklist mode (added 2026-05). Both fields default safely so old - # state_meta rows load unchanged. - checklist: List[ChecklistItem] = field(default_factory=list) - decomposed: bool = False # has Phase-A run for this goal? def to_json(self) -> str: - data = asdict(self) - # asdict already serializes ChecklistItem via dataclass recursion. - return json.dumps(data, ensure_ascii=False) + return json.dumps(asdict(self), ensure_ascii=False) @classmethod def from_json(cls, raw: str) -> "GoalState": data = json.loads(raw) - raw_checklist = data.get("checklist") or [] - checklist: List[ChecklistItem] = [] - if isinstance(raw_checklist, list): - for item in raw_checklist: - if isinstance(item, dict): - try: - checklist.append(ChecklistItem.from_dict(item)) - except Exception: - continue return cls( goal=data.get("goal", ""), status=data.get("status", "active"), @@ -321,39 +126,8 @@ class GoalState: last_reason=data.get("last_reason"), paused_reason=data.get("paused_reason"), consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0), - checklist=checklist, - decomposed=bool(data.get("decomposed", False)), ) - # --- checklist helpers ------------------------------------------------ - - def checklist_counts(self) -> Tuple[int, int, int, int]: - """Return (total, completed, impossible, pending).""" - total = len(self.checklist) - completed = sum(1 for it in self.checklist if it.status == ITEM_COMPLETED) - impossible = sum(1 for it in self.checklist if it.status == ITEM_IMPOSSIBLE) - pending = total - completed - impossible - return total, completed, impossible, pending - - def all_terminal(self) -> bool: - """True iff at least one item exists and every item is in a terminal status.""" - if not self.checklist: - return False - return all(it.status in TERMINAL_ITEM_STATUSES for it in self.checklist) - - def render_checklist(self, *, numbered: bool = False) -> str: - if not self.checklist: - return "(empty)" - lines = [] - for i, item in enumerate(self.checklist, start=1): - marker = ITEM_MARKERS.get(item.status, "[?]") - prefix = f"{i}. {marker}" if numbered else f" {marker}" - line = f"{prefix} {item.text}" - if item.status == ITEM_IMPOSSIBLE and item.evidence: - line += f" (impossible: {item.evidence})" - lines.append(line) - return "\n".join(lines) - # ────────────────────────────────────────────────────────────────────── # Persistence (SessionDB state_meta) @@ -441,63 +215,7 @@ def clear_goal(session_id: str) -> None: # ────────────────────────────────────────────────────────────────────── -# Conversation-history dump (read by the judge tool loop) -# ────────────────────────────────────────────────────────────────────── - - -def _goals_dump_dir() -> Optional[Path]: - """Return ``/goals`` (created on first use), or None on error.""" - try: - from hermes_constants import get_hermes_home - - home = Path(get_hermes_home()) - except Exception as exc: - logger.debug("goals dump dir: get_hermes_home failed: %s", exc) - return None - try: - path = home / "goals" - path.mkdir(parents=True, exist_ok=True) - return path - except Exception as exc: - logger.debug("goals dump dir: mkdir failed: %s", exc) - return None - - -def _safe_session_filename(session_id: str) -> str: - """Make a session_id safe for use as a filename component.""" - cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", session_id or "unknown") - # Bound length to keep filesystem happy. - return cleaned[:128] or "unknown" - - -def conversation_dump_path(session_id: str) -> Optional[Path]: - """Where the dumped messages JSON for ``session_id`` lives.""" - base = _goals_dump_dir() - if base is None: - return None - return base / f"{_safe_session_filename(session_id)}.json" - - -def dump_conversation(session_id: str, messages: List[Dict[str, Any]]) -> Optional[Path]: - """Write ``messages`` to the goals/ dump file. Returns the path on success.""" - if not session_id or not messages: - return None - path = conversation_dump_path(session_id) - if path is None: - return None - try: - # Best-effort: messages may contain non-JSON-serializable objects from - # provider-specific adapter shims. Fall through with default=str. - with open(path, "w", encoding="utf-8") as fh: - json.dump(messages, fh, ensure_ascii=False, indent=2, default=str) - return path - except Exception as exc: - logger.debug("dump_conversation: write failed: %s", exc) - return None - - -# ────────────────────────────────────────────────────────────────────── -# Judge: parsing helpers +# Judge # ────────────────────────────────────────────────────────────────────── @@ -509,34 +227,11 @@ def _truncate(text: str, limit: int) -> str: return text[:limit] + "… [truncated]" -_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) - - -def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]: - """Best-effort extraction of a single JSON object from a possibly-prosey reply.""" - if not raw: - return None - text = raw.strip() - if text.startswith("```"): - text = text.strip("`") - nl = text.find("\n") - if nl != -1: - text = text[nl + 1:] - try: - data = json.loads(text) - except Exception: - match = _JSON_OBJECT_RE.search(text) - if not match: - return None - try: - data = json.loads(match.group(0)) - except Exception: - return None - return data if isinstance(data, dict) else None +_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL) def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: - """Parse the freeform judge's reply. Fail-open to ``(False, "", parse_failed)``. + """Parse the judge's reply. Fail-open to ``(False, "", parse_failed)``. Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the judge returned output that couldn't be interpreted as the expected JSON @@ -547,8 +242,30 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: if not raw: return False, "judge returned empty response", True - data = _extract_json_object(raw) - if data is None: + text = raw.strip() + + # Strip markdown code fences the model may wrap JSON in. + if text.startswith("```"): + text = text.strip("`") + # Peel off leading json/JSON/etc tag + nl = text.find("\n") + if nl != -1: + text = text[nl + 1:] + + # First try: parse the whole blob. + data: Optional[Dict[str, Any]] = None + try: + data = json.loads(text) + except Exception: + # Second try: pull the first JSON object out. + match = _JSON_OBJECT_RE.search(text) + if match: + try: + data = json.loads(match.group(0)) + except Exception: + data = None + + if not isinstance(data, dict): return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True done_val = data.get("done") @@ -562,552 +279,49 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: return done, reason, False -def _parse_decompose_response(raw: str) -> Tuple[List[Dict[str, Any]], bool]: - """Parse a Phase-A decompose reply. Returns (items, parse_failed).""" - if not raw: - return [], True - data = _extract_json_object(raw) - if data is None: - return [], True - raw_items = data.get("checklist") - if not isinstance(raw_items, list): - return [], True - out: List[Dict[str, Any]] = [] - for item in raw_items: - if isinstance(item, dict): - text = str(item.get("text", "")).strip() - if text: - out.append({"text": text}) - elif isinstance(item, str): - text = item.strip() - if text: - out.append({"text": text}) - return out, False - - -def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]: - """Parse a Phase-B checklist eval reply. Returns (parsed, parse_failed). - - parsed = {"updates": [...], "new_items": [...], "reason": str} - """ - if not raw: - return {"updates": [], "new_items": [], "reason": "judge returned empty response"}, True - data = _extract_json_object(raw) - if data is None: - return ( - { - "updates": [], - "new_items": [], - "reason": f"judge reply was not JSON: {_truncate(raw, 200)!r}", - }, - True, - ) - updates = data.get("updates") or [] - new_items = data.get("new_items") or [] - reason = str(data.get("reason") or "").strip() or "no reason provided" - norm_updates = [] - if isinstance(updates, list): - for upd in updates: - if not isinstance(upd, dict): - continue - try: - # Judge sees the checklist rendered with 1-based indices - # (matches the /subgoal CLI). Convert to 0-based here so the - # apply layer can index ``state.checklist`` directly. - idx_1based = int(upd.get("index")) - except (TypeError, ValueError): - continue - idx = idx_1based - 1 - status = str(upd.get("status", "")).strip().lower() - if status not in TERMINAL_ITEM_STATUSES: - # Phase-B only accepts terminal flips. Pending → pending is a no-op. - continue - evidence = str(upd.get("evidence") or "").strip() or None - norm_updates.append({"index": idx, "status": status, "evidence": evidence}) - norm_new = [] - if isinstance(new_items, list): - for it in new_items: - if isinstance(it, dict): - text = str(it.get("text", "")).strip() - if text: - norm_new.append({"text": text}) - elif isinstance(it, str): - text = it.strip() - if text: - norm_new.append({"text": text}) - return {"updates": norm_updates, "new_items": norm_new, "reason": reason}, False - - -# ────────────────────────────────────────────────────────────────────── -# Judge: read_file tool for the judge's bounded tool loop -# ────────────────────────────────────────────────────────────────────── - - -# ────────────────────────────────────────────────────────────────────── -# Judge tool schemas: read_file (history inspection) + -# submit_checklist (Phase A) + update_checklist (Phase B) -# -# Forcing the judge to emit through tool calls is dramatically more -# reliable than asking it to reply with JSON text. Most providers -# enforce the schema server-side, so weak/small judge models can no -# longer drift into prose, markdown fences, or empty bodies. -# ────────────────────────────────────────────────────────────────────── - - -_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = { - "type": "function", - "function": { - "name": "read_file", - "description": ( - "Read a portion of the dumped conversation history JSON file. " - "Use this when the snippet alone isn't enough to rule. Returns " - "lines from the file with 1-based line numbers. Pagination " - "supported via offset and limit. Reads beyond a built-in cap " - "are truncated." - ), - "parameters": { - "type": "object", - "properties": { - "path": { - "type": "string", - "description": ( - "Absolute path to the conversation history file. " - "You were given this in the user message." - ), - }, - "offset": { - "type": "integer", - "description": "1-indexed starting line number (default 1).", - "default": 1, - }, - "limit": { - "type": "integer", - "description": ( - f"Max lines to return (default {_JUDGE_READ_FILE_MAX_LINES})." - ), - "default": _JUDGE_READ_FILE_MAX_LINES, - }, - }, - "required": ["path"], - }, - }, -} - - -_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = { - "type": "function", - "function": { - "name": "submit_checklist", - "description": ( - "Submit the harsh, detailed completion-criteria checklist you " - "decomposed the goal into. Each item is one verifiable " - "completion criterion. Bias toward more items, not fewer." - ), - "parameters": { - "type": "object", - "properties": { - "items": { - "type": "array", - "description": ( - "List of checklist items. Each item is a single " - "verifiable statement of fact about the finished " - "work. Aim for at least 5 items; more is better " - "when warranted." - ), - "items": { - "type": "object", - "properties": { - "text": { - "type": "string", - "description": "The completion-criterion text.", - }, - }, - "required": ["text"], - }, - }, - }, - "required": ["items"], - }, - }, -} - - -_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = { - "type": "function", - "function": { - "name": "update_checklist", - "description": ( - "Issue your verdict on the current checklist. For each " - "currently-pending item, decide whether the agent's most " - "recent response (and conversation history if you read it) " - "shows the item is satisfied. You may also append new items " - "the original decomposition missed. Call this exactly once " - "when you are ready to rule — calling it ends the evaluation." - ), - "parameters": { - "type": "object", - "properties": { - "updates": { - "type": "array", - "description": ( - "Per-item rulings. Use the 1-based ``index`` shown " - "in the checklist. ``status`` must be 'completed' " - "(clear evidence the item is done) or 'impossible' " - "(item cannot be achieved in this environment). " - "Items already in a terminal status are frozen — " - "do not include them." - ), - "items": { - "type": "object", - "properties": { - "index": { - "type": "integer", - "description": "1-based checklist index.", - }, - "status": { - "type": "string", - "enum": ["completed", "impossible"], - }, - "evidence": { - "type": "string", - "description": ( - "One-sentence specific citation of why " - "this item is done or impossible. " - "Reference the agent's actual output." - ), - }, - }, - "required": ["index", "status", "evidence"], - }, - }, - "new_items": { - "type": "array", - "description": ( - "Optional: completion criteria the original " - "decomposition missed. Stay strict — only add " - "items that genuinely belong as completion " - "criteria for this goal." - ), - "items": { - "type": "object", - "properties": { - "text": { - "type": "string", - "description": "The new criterion text.", - }, - }, - "required": ["text"], - }, - }, - "reason": { - "type": "string", - "description": "One-sentence overall rationale for this round of updates.", - }, - }, - "required": ["updates", "new_items", "reason"], - }, - }, -} - - -def _judge_read_file( - path: str, - *, - offset: int = 1, - limit: int = _JUDGE_READ_FILE_MAX_LINES, - allowed_path: Optional[Path] = None, -) -> str: - """Bounded read of the dumped conversation file. Returns JSON-serializable text. - - Restricted to ``allowed_path`` when provided — the judge cannot use this - tool to read arbitrary files. - """ - if not path: - return json.dumps({"error": "path is required"}) - try: - target = Path(path).resolve() - except Exception as exc: - return json.dumps({"error": f"path resolve failed: {exc}"}) - - if allowed_path is not None: - try: - allowed = allowed_path.resolve() - except Exception: - allowed = allowed_path - if target != allowed: - return json.dumps({ - "error": ( - f"read_file is restricted to the conversation dump path. " - f"Allowed: {allowed}" - ) - }) - - if not target.exists(): - return json.dumps({"error": f"file not found: {target}"}) - try: - offset = max(1, int(offset or 1)) - limit = max(1, min(int(limit or _JUDGE_READ_FILE_MAX_LINES), _JUDGE_READ_FILE_MAX_LINES)) - except (TypeError, ValueError): - return json.dumps({"error": "offset and limit must be integers"}) - - try: - with open(target, "r", encoding="utf-8", errors="replace") as fh: - lines = fh.readlines() - except Exception as exc: - return json.dumps({"error": f"read failed: {exc}"}) - - total = len(lines) - start = offset - 1 - end = min(start + limit, total) - slice_lines = lines[start:end] - out = "".join(slice_lines) - if len(out) > _JUDGE_READ_FILE_MAX_CHARS: - out = out[:_JUDGE_READ_FILE_MAX_CHARS] + "\n… [truncated by judge read cap]" - return json.dumps({ - "path": str(target), - "total_lines": total, - "offset": offset, - "returned": len(slice_lines), - "next_offset": end + 1 if end < total else None, - "content": out, - }, ensure_ascii=False) - - -# ────────────────────────────────────────────────────────────────────── -# Judge: phase-A (decompose) and phase-B (evaluate) -# ────────────────────────────────────────────────────────────────────── - - -def _get_judge_client() -> Tuple[Optional[Any], str]: - """Return (client, model) or (None, '') when unavailable.""" - try: - from agent.auxiliary_client import get_text_auxiliary_client - except Exception as exc: - logger.debug("goal judge: auxiliary client import failed: %s", exc) - return None, "" - try: - client, model = get_text_auxiliary_client("goal_judge") - except Exception as exc: - logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) - return None, "" - if client is None or not model: - return None, "" - return client, model - - -def _extract_tool_call(msg: Any, tool_name: str) -> Optional[Dict[str, Any]]: - """Find a tool call by name on a chat-completions message. Returns - ``{"id", "name", "arguments": }`` or None. - - Robust to provider shims that return tool_calls as objects or dicts - and arguments as JSON strings or already-parsed dicts. - """ - tool_calls = getattr(msg, "tool_calls", None) or [] - for tc in tool_calls: - try: - tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?" - fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None) - if fn is None: - continue - fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "") - if fn_name != tool_name: - continue - fn_args_raw = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "") - if isinstance(fn_args_raw, str): - try: - args = json.loads(fn_args_raw) if fn_args_raw else {} - except Exception: - args = {} - elif isinstance(fn_args_raw, dict): - args = fn_args_raw - else: - args = {} - return {"id": tc_id, "name": fn_name, "arguments": args} - except Exception: - continue - return None - - -def _serialize_assistant_tool_calls(msg: Any) -> List[Dict[str, Any]]: - """Convert a provider-shim tool_calls list into plain-dict form for - inclusion in subsequent ``messages=[...]`` payloads.""" - out: List[Dict[str, Any]] = [] - for tc in getattr(msg, "tool_calls", None) or []: - try: - tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?" - fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None) - fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "") - fn_args = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "") - if not isinstance(fn_args, str): - try: - fn_args = json.dumps(fn_args) - except Exception: - fn_args = "{}" - out.append({ - "id": tc_id, - "type": "function", - "function": {"name": fn_name or "", "arguments": fn_args}, - }) - except Exception: - continue - return out - - -def _call_judge_with_tool_choice( - client: Any, - *, - model: str, - messages: List[Dict[str, Any]], - tools: List[Dict[str, Any]], - forced_tool_name: Optional[str], - timeout: float, - max_tokens: int = 1500, -) -> Tuple[Optional[Any], Optional[str]]: - """Call the judge with a forced tool choice, falling back to ``auto`` - if the provider rejects ``required`` / a specific function choice. - - Returns ``(response, error)``. On success, ``error`` is None. - """ - # First attempt: force the specific tool. Most modern providers - # support {"type": "function", "function": {"name": "..."}}. - primary_choice: Any - if forced_tool_name: - primary_choice = {"type": "function", "function": {"name": forced_tool_name}} - else: - primary_choice = "required" - - attempts: List[Any] = [primary_choice, "required", "auto"] - last_err: Optional[str] = None - for choice in attempts: - try: - return client.chat.completions.create( - model=model, - messages=messages, - tools=tools, - tool_choice=choice, - temperature=0, - max_tokens=max_tokens, - timeout=timeout, - ), None - except Exception as exc: - last_err = f"{type(exc).__name__}: {exc}" - # Only retry on errors that look like the provider rejecting the - # tool_choice shape. Network errors etc. should bail immediately. - msg = str(exc).lower() - if not any(token in msg for token in ( - "tool_choice", "tool choice", "required", "function call", - "unsupported", "not supported", "invalid", "400", - )): - return None, last_err - logger.debug("goal judge: tool_choice=%r rejected (%s); falling back", choice, exc) - continue - return None, last_err or "all tool_choice fallbacks failed" - - -def decompose_goal( - goal: str, - *, - timeout: float = DEFAULT_JUDGE_TIMEOUT, -) -> Tuple[List[Dict[str, Any]], Optional[str]]: - """Phase-A: ask the judge to break the goal into a checklist via a - forced ``submit_checklist`` tool call. - - Returns ``(items, error)``. On any failure, returns ``([], reason)`` - so the caller can fall back to freeform mode. - """ - if not goal.strip(): - return [], "empty goal" - - client, model = _get_judge_client() - if client is None: - return [], "auxiliary client unavailable" - - messages = [ - {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT}, - { - "role": "user", - "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format( - goal=_truncate(goal, 4000) - ), - }, - ] - - resp, err = _call_judge_with_tool_choice( - client, - model=model, - messages=messages, - tools=[_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA], - forced_tool_name="submit_checklist", - timeout=timeout, - max_tokens=2000, - ) - if resp is None: - logger.info("goal decompose: API call failed (%s)", err) - return [], f"decompose error: {err}" - - try: - msg = resp.choices[0].message - except Exception: - return [], "decompose response malformed" - - tc = _extract_tool_call(msg, "submit_checklist") - if tc is None: - # Provider responded but didn't call the tool. Try parsing content - # as a last-ditch backstop so a fully-broken provider doesn't - # silently leave the user with no checklist at all. - content = getattr(msg, "content", "") or "" - items, parse_failed = _parse_decompose_response(content) - if parse_failed or not items: - logger.info( - "goal decompose: no submit_checklist tool call AND no parseable JSON (raw=%r)", - _truncate(content, 200), - ) - return [], "decompose: judge did not call submit_checklist" - logger.info("goal decompose: fell back to JSON-content parser (%d items)", len(items)) - return items, None - - raw_items = tc["arguments"].get("items") or [] - items: List[Dict[str, Any]] = [] - if isinstance(raw_items, list): - for entry in raw_items: - if isinstance(entry, dict): - text = str(entry.get("text", "")).strip() - if text: - items.append({"text": text}) - elif isinstance(entry, str): - text = entry.strip() - if text: - items.append({"text": text}) - - if not items: - logger.info("goal decompose: submit_checklist returned empty items list") - return [], "decompose: empty checklist" - - logger.info("goal decompose: produced %d checklist items via tool call", len(items)) - return items, None - - -def judge_goal_freeform( +def judge_goal( goal: str, last_response: str, *, timeout: float = DEFAULT_JUDGE_TIMEOUT, ) -> Tuple[str, str, bool]: - """Legacy freeform judge — kept for goals with no checklist. + """Ask the auxiliary model whether the goal is satisfied. Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``, - ``"continue"``, or ``"skipped"``. + ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached). + + ``parse_failed`` is True only when the judge call succeeded but its output + was unusable (empty or non-JSON). API/transport errors return False — they + are transient and should fail-open silently. Callers use this flag to + auto-pause after N consecutive parse failures (see + ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``). + + This is deliberately fail-open: any error returns ``("continue", "...", False)`` + so a broken judge doesn't wedge progress — the turn budget and the + consecutive-parse-failures auto-pause are the backstops. """ if not goal.strip(): return "skipped", "empty goal", False if not last_response.strip(): + # No substantive reply this turn — almost certainly not done yet. return "continue", "empty response (nothing to evaluate)", False - client, model = _get_judge_client() - if client is None: + try: + from agent.auxiliary_client import get_text_auxiliary_client + except Exception as exc: + logger.debug("goal judge: auxiliary client import failed: %s", exc) return "continue", "auxiliary client unavailable", False - prompt = EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE.format( + try: + client, model = get_text_auxiliary_client("goal_judge") + except Exception as exc: + logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) + return "continue", "auxiliary client unavailable", False + + if client is None or not model: + return "continue", "no auxiliary client configured", False + + prompt = JUDGE_USER_PROMPT_TEMPLATE.format( goal=_truncate(goal, 2000), response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), ) @@ -1116,7 +330,7 @@ def judge_goal_freeform( resp = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_FREEFORM}, + {"role": "system", "content": JUDGE_SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ], temperature=0, @@ -1134,218 +348,10 @@ def judge_goal_freeform( done, reason, parse_failed = _parse_judge_response(raw) verdict = "done" if done else "continue" - logger.info("goal judge (freeform): verdict=%s reason=%s", verdict, _truncate(reason, 120)) + logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120)) return verdict, reason, parse_failed -def evaluate_checklist( - state: GoalState, - last_response: str, - *, - history_path: Optional[Path], - timeout: float = DEFAULT_JUDGE_TIMEOUT, - max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS, -) -> Tuple[Dict[str, Any], bool]: - """Phase-B: judge evaluates each pending checklist item via forced - tool calls. - - The judge has two tools available: - - ``read_file``: inspect the dumped conversation history - - ``update_checklist``: issue the verdict (terminates the loop) - - ``tool_choice="required"`` forces one of them every iteration. We loop - until ``update_checklist`` is called or ``max_tool_calls`` is exhausted. - - Returns ``(parsed, parse_failed)`` where parsed is - ``{"updates": [...], "new_items": [...], "reason": str}``. - Falls open on transport errors: empty updates/new_items, parse_failed=False. - """ - client, model = _get_judge_client() - if client is None: - return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False) - - # Render checklist with 1-based indices the judge addresses via the - # update_checklist tool's ``index`` field. - checklist_block = state.render_checklist(numbered=True) - - user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format( - goal=_truncate(state.goal, 2000), - checklist_block=checklist_block, - response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), - history_path=str(history_path) if history_path else "(unavailable — judge from snippet only)", - ) - - messages: List[Dict[str, Any]] = [ - {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_CHECKLIST}, - {"role": "user", "content": user_prompt}, - ] - - # Build the toolbox: read_file is only useful when we actually have a - # history file to read, so we omit it otherwise to keep the schema lean. - tools: List[Dict[str, Any]] = [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA] - if history_path is not None: - tools.insert(0, _JUDGE_READ_FILE_TOOL_SCHEMA) - - reads_left = max(0, int(max_tool_calls)) if history_path is not None else 0 - - # Bound the overall loop generously — the judge will normally finish in - # one or two passes (read_file once, then update_checklist; or just - # update_checklist directly). - for iteration in range(reads_left + 2): - # When out of read budget, drop read_file from the toolbox so the - # judge MUST emit update_checklist. - loop_tools = tools if reads_left > 0 else [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA] - # Forcing update_checklist directly when reads are exhausted gives - # us the strongest guarantee of termination. - forced = "update_checklist" if reads_left <= 0 else None - - resp, err = _call_judge_with_tool_choice( - client, - model=model, - messages=messages, - tools=loop_tools, - forced_tool_name=forced, - timeout=timeout, - max_tokens=1500, - ) - if resp is None: - logger.info("goal judge (checklist): API call failed (%s)", err) - return ( - { - "updates": [], - "new_items": [], - "reason": f"judge error: {err}", - }, - False, - ) - - try: - msg = resp.choices[0].message - except Exception: - return ( - {"updates": [], "new_items": [], "reason": "judge response malformed"}, - True, - ) - - # Did the judge call update_checklist? If yes, we're done. - update_tc = _extract_tool_call(msg, "update_checklist") - if update_tc is not None: - parsed = _normalize_update_args(update_tc["arguments"]) - logger.info( - "goal judge (checklist): updates=%d new_items=%d reason=%s", - len(parsed.get("updates") or []), - len(parsed.get("new_items") or []), - _truncate(parsed.get("reason", ""), 120), - ) - return parsed, False - - # Did the judge call read_file? If yes, run it and feed the result back. - read_tc = _extract_tool_call(msg, "read_file") - if read_tc is not None and reads_left > 0: - args = read_tc["arguments"] - tool_result = _judge_read_file( - str(args.get("path", "")), - offset=args.get("offset", 1), - limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES), - allowed_path=history_path, - ) - messages.append({ - "role": "assistant", - "content": getattr(msg, "content", "") or "", - "tool_calls": _serialize_assistant_tool_calls(msg), - }) - messages.append({ - "role": "tool", - "tool_call_id": read_tc["id"], - "name": "read_file", - "content": tool_result, - }) - reads_left -= 1 - continue - - # Neither tool was called. Try parsing the content body as a last- - # ditch backstop, then bail. - content = getattr(msg, "content", "") or "" - if content.strip(): - parsed, parse_failed = _parse_evaluate_response(content) - if not parse_failed: - logger.info( - "goal judge (checklist): fell back to JSON-content parser " - "updates=%d new_items=%d", - len(parsed.get("updates") or []), - len(parsed.get("new_items") or []), - ) - return parsed, False - logger.info( - "goal judge (checklist): judge emitted neither read_file nor " - "update_checklist (iteration=%d, content=%r) — bailing", - iteration, _truncate(content, 120), - ) - return ( - { - "updates": [], - "new_items": [], - "reason": "judge did not call update_checklist", - }, - True, - ) - - # Loop exhausted without an update_checklist call. - return ( - { - "updates": [], - "new_items": [], - "reason": "judge tool-loop exhausted without verdict", - }, - True, - ) - - -def _normalize_update_args(args: Dict[str, Any]) -> Dict[str, Any]: - """Validate and normalize the ``update_checklist`` tool arguments. - - Performs the same 1-based → 0-based conversion and terminal-status - filter as ``_parse_evaluate_response``. Returns the canonical - ``{updates, new_items, reason}`` shape callers expect. - """ - raw_updates = args.get("updates") or [] - raw_new = args.get("new_items") or [] - reason = str(args.get("reason") or "").strip() or "no reason provided" - - norm_updates: List[Dict[str, Any]] = [] - if isinstance(raw_updates, list): - for upd in raw_updates: - if not isinstance(upd, dict): - continue - try: - idx_1based = int(upd.get("index")) - except (TypeError, ValueError): - continue - status = str(upd.get("status", "")).strip().lower() - if status not in TERMINAL_ITEM_STATUSES: - continue - evidence = str(upd.get("evidence") or "").strip() or None - norm_updates.append({ - "index": idx_1based - 1, # 1-based → 0-based for apply layer - "status": status, - "evidence": evidence, - }) - - norm_new: List[Dict[str, Any]] = [] - if isinstance(raw_new, list): - for it in raw_new: - if isinstance(it, dict): - text = str(it.get("text", "")).strip() - if text: - norm_new.append({"text": text}) - elif isinstance(it, str): - text = it.strip() - if text: - norm_new.append({"text": text}) - - return {"updates": norm_updates, "new_items": norm_new, "reason": reason} - - # ────────────────────────────────────────────────────────────────────── # GoalManager — the orchestration surface CLI + gateway talk to # ────────────────────────────────────────────────────────────────────── @@ -1362,12 +368,8 @@ class GoalManager: - ``clear()`` — remove the active goal. - ``pause()`` / ``resume()`` — explicit user controls. - ``status()`` — printable one-liner. - - ``add_subgoal(text)`` — user appends a checklist item. - - ``mark_subgoal(index, status)`` — user flips an item (override). - - ``remove_subgoal(index)`` — user deletes an item. - - ``clear_checklist()`` — user wipes the checklist; next turn re-decomposes. - - ``evaluate_after_turn(last_response, agent=None)`` — call the judge, - update state, return a decision dict. + - ``evaluate_after_turn(last_response)`` — call the judge, update state, + and return a decision dict the caller uses to drive the next turn. - ``next_continuation_prompt()`` — the canonical user-role message to feed back into ``run_conversation``. """ @@ -1394,26 +396,14 @@ class GoalManager: if s is None or s.status in ("cleared",): return "No active goal. Set one with /goal ." turns = f"{s.turns_used}/{s.max_turns} turns" - cl_total, cl_done, cl_imp, _ = s.checklist_counts() - cl_text = "" - if cl_total: - cl_text = f", {cl_done + cl_imp}/{cl_total} done" if s.status == "active": - return f"⊙ Goal (active, {turns}{cl_text}): {s.goal}" + return f"⊙ Goal (active, {turns}): {s.goal}" if s.status == "paused": extra = f" — {s.paused_reason}" if s.paused_reason else "" - return f"⏸ Goal (paused, {turns}{cl_text}{extra}): {s.goal}" + return f"⏸ Goal (paused, {turns}{extra}): {s.goal}" if s.status == "done": - return f"✓ Goal done ({turns}{cl_text}): {s.goal}" - return f"Goal ({s.status}, {turns}{cl_text}): {s.goal}" - - def render_checklist(self) -> str: - """Public helper for the /subgoal slash command.""" - if self._state is None: - return "(no active goal)" - if not self._state.checklist: - return "(checklist empty — judge will populate it on the next turn)" - return self._state.render_checklist(numbered=True) + return f"✓ Goal done ({turns}): {s.goal}" + return f"Goal ({s.status}, {turns}): {s.goal}" # --- mutation ----------------------------------------------------- @@ -1428,8 +418,6 @@ class GoalManager: max_turns=int(max_turns) if max_turns else self.default_max_turns, created_at=time.time(), last_turn_at=0.0, - checklist=[], - decomposed=False, ) self._state = state save_goal(self.session_id, state) @@ -1468,77 +456,6 @@ class GoalManager: self._state.last_reason = reason save_goal(self.session_id, self._state) - # --- /subgoal user controls --------------------------------------- - - def add_subgoal(self, text: str) -> ChecklistItem: - """User appends a new checklist item. Requires an active or paused goal.""" - if self._state is None: - raise RuntimeError("no active goal") - text = (text or "").strip() - if not text: - raise ValueError("subgoal text is empty") - item = ChecklistItem( - text=text, - status=ITEM_PENDING, - added_by=ADDED_BY_USER, - added_at=time.time(), - ) - self._state.checklist.append(item) - save_goal(self.session_id, self._state) - return item - - def mark_subgoal(self, index_1based: int, status: str) -> ChecklistItem: - """User overrides an item's status. - - ``status`` may be ``completed``, ``impossible``, or ``pending`` - (the last only as an undo flow). Stickiness rules do NOT apply to - user actions — the user is the only authority that can revert - terminal items. - """ - if self._state is None: - raise RuntimeError("no active goal") - status = (status or "").strip().lower() - if status not in VALID_ITEM_STATUSES: - raise ValueError( - f"status must be one of {sorted(VALID_ITEM_STATUSES)}; got {status!r}" - ) - idx = int(index_1based) - 1 - if idx < 0 or idx >= len(self._state.checklist): - raise IndexError( - f"index out of range (1..{len(self._state.checklist)})" - ) - item = self._state.checklist[idx] - item.status = status - if status in TERMINAL_ITEM_STATUSES: - item.completed_at = time.time() - if not item.evidence: - item.evidence = "marked by user" - else: - item.completed_at = None - # Don't wipe judge-supplied evidence on undo — useful audit trail. - save_goal(self.session_id, self._state) - return item - - def remove_subgoal(self, index_1based: int) -> ChecklistItem: - if self._state is None: - raise RuntimeError("no active goal") - idx = int(index_1based) - 1 - if idx < 0 or idx >= len(self._state.checklist): - raise IndexError( - f"index out of range (1..{len(self._state.checklist)})" - ) - removed = self._state.checklist.pop(idx) - save_goal(self.session_id, self._state) - return removed - - def clear_checklist(self) -> None: - """Wipe the checklist and reset decomposed=False so the judge re-decomposes.""" - if self._state is None: - return - self._state.checklist = [] - self._state.decomposed = False - save_goal(self.session_id, self._state) - # --- the main entry point called after every turn ----------------- def evaluate_after_turn( @@ -1546,8 +463,6 @@ class GoalManager: last_response: str, *, user_initiated: bool = True, - agent: Any = None, - messages: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: """Run the judge and update state. Return a decision dict. @@ -1555,21 +470,11 @@ class GoalManager: continuation prompt we fed ourselves (False). Both increment ``turns_used`` because both consume model budget. - ``messages`` is the agent's full conversation list for this session. - When provided, it's dumped to ``/goals/.json`` so - the Phase-B judge's read_file tool can inspect history. Optional — - when missing, the judge runs from the snippet only. - - ``agent`` is a back-compat path — when ``messages`` is None we try - to extract them from common AIAgent attribute names. Most callers - should pass ``messages`` directly because AIAgent does not store - the message list as a public instance attribute. - Decision keys: - ``status``: current goal status after update - ``should_continue``: bool — caller should fire another turn - ``continuation_prompt``: str or None - - ``verdict``: "done" | "continue" | "skipped" | "inactive" | "decompose" + - ``verdict``: "done" | "continue" | "skipped" | "inactive" - ``reason``: str - ``message``: user-visible one-liner to print/send """ @@ -1588,45 +493,7 @@ class GoalManager: state.turns_used += 1 state.last_turn_at = time.time() - # ── Phase A: decompose (first call after /goal set) ─────────── - if not state.decomposed: - items, err = decompose_goal(state.goal) - state.decomposed = True - decompose_message = "" - if items: - now = time.time() - for entry in items: - state.checklist.append( - ChecklistItem( - text=entry["text"], - status=ITEM_PENDING, - added_by=ADDED_BY_JUDGE, - added_at=now, - ) - ) - state.last_verdict = "decompose" - state.last_reason = f"decomposed into {len(items)} items" - decompose_message = ( - f"⊙ Goal checklist created ({len(items)} items). " - f"Use /subgoal to view or edit it." - ) - save_goal(self.session_id, state) - return { - "status": "active", - "should_continue": True, - "continuation_prompt": self.next_continuation_prompt(), - "verdict": "decompose", - "reason": state.last_reason, - "message": decompose_message, - } - # Decompose failed — fall through to freeform mode below. - logger.info("goal: decompose failed (%s) — falling back to freeform judge", err) - state.last_reason = f"decompose failed: {err}" - - # ── Phase B: evaluate ──────────────────────────────────────── - verdict, reason, parse_failed = self._evaluate_state_phase_b( - state, last_response, agent=agent, messages=messages - ) + verdict, reason, parse_failed = judge_goal(state.goal, last_response) state.last_verdict = verdict state.last_reason = reason @@ -1651,7 +518,11 @@ class GoalManager: } # Auto-pause when the judge model can't produce the expected JSON - # verdict N turns in a row. + # verdict N turns in a row. Points the user at the goal_judge config + # so they can route this side task to a model that follows the + # contract (e.g. google/gemini-3-flash-preview). Without this guard, + # weak judge models burn the entire turn budget returning prose or + # empty strings. if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES: state.status = "paused" state.paused_reason = ( @@ -1693,10 +564,6 @@ class GoalManager: } save_goal(self.session_id, state) - cl_total, cl_done, cl_imp, _ = state.checklist_counts() - progress = "" - if cl_total: - progress = f" — {cl_done + cl_imp}/{cl_total} done" return { "status": "active", "should_continue": True, @@ -1704,168 +571,23 @@ class GoalManager: "verdict": "continue", "reason": reason, "message": ( - f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}{progress}): {reason}" + f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}" ), } - def _evaluate_state_phase_b( - self, - state: GoalState, - last_response: str, - *, - agent: Any = None, - messages: Optional[List[Dict[str, Any]]] = None, - ) -> Tuple[str, str, bool]: - """Run the right kind of Phase-B evaluation given current state. - - With a non-empty checklist: harsh per-item evaluation with a bounded - read_file tool loop. - - With an empty checklist (e.g. decompose failed twice): fall back to - the legacy freeform judge so the goal still has a way to terminate. - """ - if not last_response.strip(): - return "continue", "empty response (nothing to evaluate)", False - - if state.checklist: - # Dump conversation history if we have one. Prefer explicit - # ``messages`` arg (most reliable); fall back to extracting from - # the agent instance for back-compat. - history_path: Optional[Path] = None - msgs: List[Dict[str, Any]] = [] - if messages: - msgs = list(messages) - elif agent is not None: - msgs = self._extract_agent_messages(agent) - if msgs: - history_path = dump_conversation(self.session_id, msgs) - if history_path is None: - logger.debug( - "goal: conversation dump failed for session %s", - self.session_id, - ) - else: - logger.debug( - "goal: no messages available for session %s — judge will run from snippet only", - self.session_id, - ) - - parsed, parse_failed = evaluate_checklist( - state, last_response, history_path=history_path - ) - self._apply_checklist_updates(state, parsed) - - if state.all_terminal(): - return "done", parsed.get("reason") or "all checklist items terminal", parse_failed - return "continue", parsed.get("reason") or "checklist progress", parse_failed - - # No checklist — freeform fallback. - verdict, reason, parse_failed = judge_goal_freeform(state.goal, last_response) - return verdict, reason, parse_failed - - # --- internal helpers --------------------------------------------- - - @staticmethod - def _extract_agent_messages(agent: Any) -> List[Dict[str, Any]]: - """Best-effort extraction of the agent's conversation history. - - Tries common attribute names so we don't tightly couple to AIAgent. - Returns an empty list when nothing is available. - """ - for attr in ("messages", "conversation_history", "_messages", "history"): - try: - msgs = getattr(agent, attr, None) - if isinstance(msgs, list) and msgs: - return msgs - except Exception: - continue - return [] - - @staticmethod - def _apply_checklist_updates(state: GoalState, parsed: Dict[str, Any]) -> None: - """Apply judge updates with stickiness: never regress terminal items.""" - now = time.time() - for upd in parsed.get("updates") or []: - try: - idx = int(upd["index"]) - except (KeyError, TypeError, ValueError): - continue - if idx < 0 or idx >= len(state.checklist): - continue - item = state.checklist[idx] - if item.status in TERMINAL_ITEM_STATUSES: - # Stickiness: judge cannot regress a terminal item. - continue - new_status = upd.get("status") - if new_status not in TERMINAL_ITEM_STATUSES: - continue - item.status = new_status - item.completed_at = now - evidence = upd.get("evidence") - if evidence: - item.evidence = evidence - - for new_item in parsed.get("new_items") or []: - text = (new_item.get("text") or "").strip() - if not text: - continue - state.checklist.append( - ChecklistItem( - text=text, - status=ITEM_PENDING, - added_by=ADDED_BY_JUDGE, - added_at=now, - ) - ) - - # --- continuation prompt ------------------------------------------ - def next_continuation_prompt(self) -> Optional[str]: if not self._state or self._state.status != "active": return None - if not self._state.checklist: - return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) - cl_total, cl_done, cl_imp, _ = self._state.checklist_counts() - return CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE.format( - goal=self._state.goal, - done=cl_done + cl_imp, - total=cl_total, - checklist=self._state.render_checklist(numbered=False), - ) - - -# Public name kept for back-compat with the previous freeform-only API. -def judge_goal( - goal: str, - last_response: str, - *, - timeout: float = DEFAULT_JUDGE_TIMEOUT, -) -> Tuple[str, str, bool]: - """Back-compat wrapper — defers to the freeform judge.""" - return judge_goal_freeform(goal, last_response, timeout=timeout) + return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) __all__ = [ - "ChecklistItem", "GoalState", "GoalManager", "CONTINUATION_PROMPT_TEMPLATE", - "CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE", "DEFAULT_MAX_TURNS", - "DEFAULT_MAX_JUDGE_TOOL_CALLS", - "ITEM_PENDING", - "ITEM_COMPLETED", - "ITEM_IMPOSSIBLE", - "ITEM_MARKERS", - "TERMINAL_ITEM_STATUSES", - "VALID_ITEM_STATUSES", "load_goal", "save_goal", "clear_goal", "judge_goal", - "judge_goal_freeform", - "decompose_goal", - "evaluate_checklist", - "conversation_dump_path", - "dump_conversation", ] diff --git a/run_agent.py b/run_agent.py index 3425c380492..cb39669ece7 100644 --- a/run_agent.py +++ b/run_agent.py @@ -10089,25 +10089,6 @@ class AIAgent: parent_session_id=old_session_id, ) self._session_db_created = True - # Forward any standing /goal state from the parent session to - # the continuation session so the goal loop survives - # auto-compression. Without this rebind, _get_goal_manager() - # constructs a fresh manager keyed on the new session_id, - # load_goal() returns None, mgr.is_active() is False, and - # the loop silently dies mid-task. The goal is stored in - # state_meta under "goal:" by hermes_cli.goals. - try: - _goal_meta_key_old = f"goal:{old_session_id}" - _goal_meta_key_new = f"goal:{self.session_id}" - _goal_blob = self._session_db.get_meta(_goal_meta_key_old) - if _goal_blob: - self._session_db.set_meta(_goal_meta_key_new, _goal_blob) - logger.info( - "goal: forwarded standing goal from %s → %s on compression", - old_session_id, self.session_id, - ) - except Exception as exc: - logger.debug("goal forward on compression failed: %s", exc) # Auto-number the title for the continuation session if old_title: try: diff --git a/tests/cli/test_cli_goal_interrupt.py b/tests/cli/test_cli_goal_interrupt.py index 879e87c6a73..851b87e856b 100644 --- a/tests/cli/test_cli_goal_interrupt.py +++ b/tests/cli/test_cli_goal_interrupt.py @@ -58,11 +58,6 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"): mgr = GoalManager(session_id=session_id, default_max_turns=5) mgr.set(goal_text) - # Skip Phase-A decompose so tests can patch judge_goal_freeform directly - # for legacy verdict assertions. - mgr.state.decomposed = True - from hermes_cli.goals import save_goal as _sg - _sg(mgr.session_id, mgr.state) cli._goal_manager = mgr return cli, mgr @@ -86,7 +81,7 @@ class TestInterruptAutoPause: # Judge MUST NOT run on an interrupted turn. If it does, we've # regressed — fail loudly instead of silently querying a mock. - with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: + with patch("hermes_cli.goals.judge_goal") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an interrupted turn" ) @@ -111,7 +106,7 @@ class TestInterruptAutoPause: cli.conversation_history = [ {"role": "assistant", "content": "partial"}, ] - with patch("hermes_cli.goals.judge_goal_freeform"): + with patch("hermes_cli.goals.judge_goal"): cli._maybe_continue_goal_after_turn() assert mgr.state.status == "paused" @@ -130,7 +125,7 @@ class TestEmptyResponseSkip: {"role": "assistant", "content": " \n\n "}, ] - with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: + with patch("hermes_cli.goals.judge_goal") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an empty response" ) @@ -149,7 +144,7 @@ class TestEmptyResponseSkip: {"role": "user", "content": "go"}, ] - with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: + with patch("hermes_cli.goals.judge_goal") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called without an assistant response" ) @@ -174,7 +169,7 @@ class TestHealthyTurnStillRuns: # Force the judge to say "continue" without touching the network. with patch( - "hermes_cli.goals.judge_goal_freeform", + "hermes_cli.goals.judge_goal", return_value=("continue", "needs more steps", False), ): cli._maybe_continue_goal_after_turn() @@ -194,7 +189,7 @@ class TestHealthyTurnStillRuns: ] with patch( - "hermes_cli.goals.judge_goal_freeform", + "hermes_cli.goals.judge_goal", return_value=("done", "goal satisfied", False), ): cli._maybe_continue_goal_after_turn() diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py index ce9e8d6ad1e..14f536aa4f8 100644 --- a/tests/gateway/test_goal_verdict_send.py +++ b/tests/gateway/test_goal_verdict_send.py @@ -106,11 +106,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("ship the feature") - mgr.state.decomposed = True - from hermes_cli.goals import save_goal as _sg - _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)): + with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -138,11 +135,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("polish the docs") - mgr.state.decomposed = True - from hermes_cli.goals import save_goal as _sg - _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)): + with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -170,7 +164,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home): state.turns_used = 2 save_goal(session_entry.session_id, state) - with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)): + with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -217,7 +211,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home): runner.adapters[Platform.TELEGRAM] = _NoSendAdapter() - with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)): + with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)): # must not raise await runner._post_turn_goal_continuation( session_entry=session_entry, diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py index 680bf5ec1c8..b5afd716c9e 100644 --- a/tests/hermes_cli/test_goals.py +++ b/tests/hermes_cli/test_goals.py @@ -253,20 +253,14 @@ class TestGoalManager: assert mgr2.is_active() def test_evaluate_after_turn_done(self, hermes_home): - """Judge says done → status=done, no continuation. - - Skips Phase-A decompose by patching ``decompose_goal`` to return - an empty checklist so the manager falls through to the freeform - judge path (legacy behavior preserved when decompose is unavailable). - """ + """Judge says done → status=done, no continuation.""" from hermes_cli import goals from hermes_cli.goals import GoalManager mgr = GoalManager(session_id="eval-sid-1") mgr.set("ship it") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)): + with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)): decision = mgr.evaluate_after_turn("I shipped the feature.") assert decision["verdict"] == "done" @@ -282,8 +276,7 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5) mgr.set("a long goal") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)): + with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)): decision = mgr.evaluate_after_turn("made some progress") assert decision["verdict"] == "continue" @@ -301,8 +294,7 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2) mgr.set("hard goal") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)): + with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.turns_used == 1 @@ -442,11 +434,9 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20) mgr.set("do a thing") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object( - goals, "judge_goal_freeform", - return_value=("continue", "judge returned empty response", True), - ): + with patch.object( + goals, "judge_goal", return_value=("continue", "judge returned empty response", True) + ): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.consecutive_parse_failures == 1 @@ -473,21 +463,17 @@ class TestJudgeParseFailureAutoPause: mgr.set("another goal") # Two parse failures… - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object( - goals, "judge_goal_freeform", - return_value=("continue", "not json", True), - ): + with patch.object( + goals, "judge_goal", return_value=("continue", "not json", True) + ): mgr.evaluate_after_turn("step 1") mgr.evaluate_after_turn("step 2") assert mgr.state.consecutive_parse_failures == 2 # …then one clean reply resets the counter. - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object( - goals, "judge_goal_freeform", - return_value=("continue", "making progress", False), - ): + with patch.object( + goals, "judge_goal", return_value=("continue", "making progress", False) + ): d = mgr.evaluate_after_turn("step 3") assert d["should_continue"] is True assert mgr.state.consecutive_parse_failures == 0 @@ -500,11 +486,9 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20) mgr.set("goal") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object( - goals, "judge_goal_freeform", - return_value=("continue", "judge error: RuntimeError", False), - ): + with patch.object( + goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False) + ): for _ in range(5): d = mgr.evaluate_after_turn("still going") assert d["should_continue"] is True @@ -521,857 +505,12 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20) mgr.set("persistent goal") - with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ - patch.object( - goals, "judge_goal_freeform", - return_value=("continue", "empty", True), - ): + with patch.object( + goals, "judge_goal", return_value=("continue", "empty", True) + ): mgr.evaluate_after_turn("r") mgr.evaluate_after_turn("r") reloaded = load_goal("parse-fail-sid-4") assert reloaded is not None assert reloaded.consecutive_parse_failures == 2 - - -# ────────────────────────────────────────────────────────────────────── -# Checklist mode: GoalState backcompat + ChecklistItem -# ────────────────────────────────────────────────────────────────────── - - -class TestGoalStateBackcompat: - def test_old_state_meta_row_loads_without_checklist_fields(self): - """A goal serialized BEFORE the checklist fields existed must - round-trip through GoalState.from_json with empty defaults.""" - from hermes_cli.goals import GoalState - - legacy_json = json.dumps({ - "goal": "do the thing", - "status": "active", - "turns_used": 3, - "max_turns": 20, - "created_at": 1.0, - "last_turn_at": 2.0, - "last_verdict": "continue", - "last_reason": "still working", - "paused_reason": None, - "consecutive_parse_failures": 1, - }) - state = GoalState.from_json(legacy_json) - assert state.goal == "do the thing" - assert state.checklist == [] - assert state.decomposed is False - - def test_new_state_round_trip(self): - from hermes_cli.goals import ( - ChecklistItem, - GoalState, - ITEM_COMPLETED, - ITEM_PENDING, - ADDED_BY_JUDGE, - ADDED_BY_USER, - ) - - state = GoalState( - goal="g", - decomposed=True, - checklist=[ - ChecklistItem(text="a", status=ITEM_COMPLETED, - added_by=ADDED_BY_JUDGE, evidence="done"), - ChecklistItem(text="b", status=ITEM_PENDING, - added_by=ADDED_BY_USER), - ], - ) - round_tripped = GoalState.from_json(state.to_json()) - assert round_tripped.decomposed is True - assert len(round_tripped.checklist) == 2 - assert round_tripped.checklist[0].text == "a" - assert round_tripped.checklist[0].status == ITEM_COMPLETED - assert round_tripped.checklist[0].evidence == "done" - assert round_tripped.checklist[1].added_by == ADDED_BY_USER - - def test_checklist_counts_and_all_terminal(self): - from hermes_cli.goals import ( - ChecklistItem, GoalState, - ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING, - ) - - state = GoalState( - goal="g", - checklist=[ - ChecklistItem(text="a", status=ITEM_COMPLETED), - ChecklistItem(text="b", status=ITEM_IMPOSSIBLE), - ChecklistItem(text="c", status=ITEM_PENDING), - ], - ) - total, done, imp, pending = state.checklist_counts() - assert (total, done, imp, pending) == (3, 1, 1, 1) - assert state.all_terminal() is False - - state.checklist[2].status = ITEM_IMPOSSIBLE - assert state.all_terminal() is True - - def test_empty_checklist_is_not_all_terminal(self): - """Empty list must NOT be considered done.""" - from hermes_cli.goals import GoalState - - state = GoalState(goal="g") - assert state.all_terminal() is False - - -# ────────────────────────────────────────────────────────────────────── -# Phase A: decompose -# ────────────────────────────────────────────────────────────────────── - - -class TestPhaseADecompose: - def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE - - mgr = GoalManager(session_id="phase-a-sid-1") - mgr.set("build a website") - - items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}] - with patch.object(goals, "decompose_goal", return_value=(items, None)): - d = mgr.evaluate_after_turn("(initial response)") - - assert d["verdict"] == "decompose" - assert d["should_continue"] is True - # Phase A produces a continuation prompt that includes the checklist. - assert d["continuation_prompt"] is not None - assert "Checklist progress" in d["continuation_prompt"] - assert mgr.state.decomposed is True - assert len(mgr.state.checklist) == 2 - assert mgr.state.checklist[0].text == "homepage exists" - assert mgr.state.checklist[0].status == ITEM_PENDING - assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE - - def test_decompose_only_runs_once(self, hermes_home): - """Decomposed=True after first call. Subsequent calls go to Phase B.""" - from hermes_cli import goals - from hermes_cli.goals import GoalManager - - mgr = GoalManager(session_id="phase-a-sid-2") - mgr.set("g") - - with patch.object( - goals, "decompose_goal", return_value=([{"text": "x"}], None) - ) as decompose_mock, patch.object( - goals, "evaluate_checklist", - return_value=({"updates": [], "new_items": [], "reason": "..."}, False), - ) as eval_mock: - mgr.evaluate_after_turn("turn 1") - mgr.evaluate_after_turn("turn 2") - mgr.evaluate_after_turn("turn 3") - - assert decompose_mock.call_count == 1 - assert eval_mock.call_count == 2 - - def test_decompose_failure_falls_back_to_freeform(self, hermes_home): - """If decompose returns no items, manager falls through to freeform judge.""" - from hermes_cli import goals - from hermes_cli.goals import GoalManager - - mgr = GoalManager(session_id="phase-a-sid-3") - mgr.set("g") - - with patch.object(goals, "decompose_goal", return_value=([], "model error")), \ - patch.object(goals, "judge_goal_freeform", - return_value=("done", "shipped", False)): - d = mgr.evaluate_after_turn("done!") - - assert d["verdict"] == "done" - assert mgr.state.decomposed is True - assert mgr.state.checklist == [] - - -# ────────────────────────────────────────────────────────────────────── -# Phase B: evaluate (checklist mode) -# ────────────────────────────────────────────────────────────────────── - - -class TestPhaseBChecklist: - def _make_decomposed_mgr(self, sid: str, items): - """Helper: skip Phase A, install a decomposed checklist directly.""" - from hermes_cli.goals import ( - GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE, - ) - from hermes_cli import goals as _g - mgr = GoalManager(session_id=sid) - mgr.set("a goal") - mgr.state.decomposed = True - mgr.state.checklist = [ - ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE) - for t in items - ] - _g.save_goal(sid, mgr.state) - return mgr - - def test_judge_flips_pending_to_completed(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING - - mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"]) - with patch.object( - goals, "evaluate_checklist", - return_value=( - { - "updates": [ - {"index": 0, "status": "completed", "evidence": "done"}, - {"index": 1, "status": "completed", "evidence": "shipped"}, - ], - "new_items": [], - "reason": "made progress", - }, - False, - ), - ): - d = mgr.evaluate_after_turn("agent did stuff") - - assert d["verdict"] == "continue" - assert mgr.state.checklist[0].status == ITEM_COMPLETED - assert mgr.state.checklist[0].evidence == "done" - assert mgr.state.checklist[1].status == ITEM_COMPLETED - assert mgr.state.checklist[2].status == ITEM_PENDING - - def test_goal_done_when_all_items_terminal(self, hermes_home): - from hermes_cli import goals - - mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"]) - with patch.object( - goals, "evaluate_checklist", - return_value=( - { - "updates": [ - {"index": 0, "status": "completed", "evidence": "ok"}, - {"index": 1, "status": "impossible", "evidence": "blocked"}, - ], - "new_items": [], - "reason": "all done or blocked", - }, - False, - ), - ): - d = mgr.evaluate_after_turn("response") - - assert d["verdict"] == "done" - assert d["should_continue"] is False - assert mgr.state.status == "done" - - def test_stickiness_judge_cannot_regress_completed(self, hermes_home): - """Once an item is completed, judge updates trying to flip it back are ignored.""" - from hermes_cli import goals - from hermes_cli.goals import ITEM_COMPLETED - - mgr = self._make_decomposed_mgr("phase-b-stick", ["a"]) - # First turn completes item 0. - with patch.object( - goals, "evaluate_checklist", - return_value=( - { - "updates": [{"index": 0, "status": "completed", "evidence": "yes"}], - "new_items": [], - "reason": "done", - }, - False, - ), - ): - mgr.evaluate_after_turn("turn 1") - assert mgr.state.checklist[0].status == ITEM_COMPLETED - # Second turn: judge tries to send a non-terminal update. - # _parse_evaluate_response already filters non-terminal, but at the - # apply layer we also skip terminal items entirely. Smoke both. - with patch.object( - goals, "evaluate_checklist", - return_value=( - { - "updates": [{"index": 0, "status": "impossible", "evidence": "regress"}], - "new_items": [], - "reason": "trying to regress", - }, - False, - ), - ): - mgr.evaluate_after_turn("turn 2") - # Sticky: status stays completed, evidence unchanged. - assert mgr.state.checklist[0].status == ITEM_COMPLETED - assert mgr.state.checklist[0].evidence == "yes" - - def test_judge_appends_new_items(self, hermes_home): - from hermes_cli import goals - - mgr = self._make_decomposed_mgr("phase-b-new", ["a"]) - with patch.object( - goals, "evaluate_checklist", - return_value=( - { - "updates": [], - "new_items": [{"text": "newly discovered"}, {"text": "also this"}], - "reason": "found more work", - }, - False, - ), - ): - mgr.evaluate_after_turn("response") - assert len(mgr.state.checklist) == 3 - assert mgr.state.checklist[1].text == "newly discovered" - assert mgr.state.checklist[1].added_by == "judge" - - -# ────────────────────────────────────────────────────────────────────── -# /subgoal user controls -# ────────────────────────────────────────────────────────────────────── - - -class TestSubgoalUserControls: - def test_add_subgoal_appends_user_item(self, hermes_home): - from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER - - mgr = GoalManager(session_id="user-sid-1") - mgr.set("g") - item = mgr.add_subgoal("user added") - assert item.text == "user added" - assert item.status == ITEM_PENDING - assert item.added_by == ADDED_BY_USER - assert len(mgr.state.checklist) == 1 - - def test_add_subgoal_requires_active_goal(self, hermes_home): - from hermes_cli.goals import GoalManager - mgr = GoalManager(session_id="user-sid-2") - with pytest.raises(RuntimeError): - mgr.add_subgoal("x") - - def test_add_subgoal_rejects_empty_text(self, hermes_home): - from hermes_cli.goals import GoalManager - mgr = GoalManager(session_id="user-sid-3") - mgr.set("g") - with pytest.raises(ValueError): - mgr.add_subgoal(" ") - - def test_mark_subgoal_uses_1_based_index(self, hermes_home): - from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE - mgr = GoalManager(session_id="user-sid-4") - mgr.set("g") - mgr.add_subgoal("a") - mgr.add_subgoal("b") - mgr.add_subgoal("c") - mgr.mark_subgoal(2, "completed") - mgr.mark_subgoal(3, "impossible") - assert mgr.state.checklist[0].status == "pending" - assert mgr.state.checklist[1].status == ITEM_COMPLETED - assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE - - def test_mark_subgoal_rejects_invalid_index(self, hermes_home): - from hermes_cli.goals import GoalManager - mgr = GoalManager(session_id="user-sid-5") - mgr.set("g") - mgr.add_subgoal("a") - with pytest.raises(IndexError): - mgr.mark_subgoal(5, "completed") - with pytest.raises(IndexError): - mgr.mark_subgoal(0, "completed") - - def test_user_can_revert_terminal_item(self, hermes_home): - """User mark_subgoal bypasses stickiness — only path to revert.""" - from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING - mgr = GoalManager(session_id="user-sid-6") - mgr.set("g") - mgr.add_subgoal("a") - mgr.mark_subgoal(1, "completed") - assert mgr.state.checklist[0].status == ITEM_COMPLETED - mgr.mark_subgoal(1, "pending") - assert mgr.state.checklist[0].status == ITEM_PENDING - - def test_remove_subgoal(self, hermes_home): - from hermes_cli.goals import GoalManager - mgr = GoalManager(session_id="user-sid-7") - mgr.set("g") - mgr.add_subgoal("a") - mgr.add_subgoal("b") - mgr.add_subgoal("c") - removed = mgr.remove_subgoal(2) - assert removed.text == "b" - assert [it.text for it in mgr.state.checklist] == ["a", "c"] - - def test_clear_checklist_resets_decomposed(self, hermes_home): - from hermes_cli.goals import GoalManager - mgr = GoalManager(session_id="user-sid-8") - mgr.set("g") - mgr.state.decomposed = True - mgr.add_subgoal("a") - mgr.clear_checklist() - assert mgr.state.checklist == [] - assert mgr.state.decomposed is False - - -# ────────────────────────────────────────────────────────────────────── -# Conversation dump -# ────────────────────────────────────────────────────────────────────── - - -class TestConversationDump: - def test_dump_writes_messages_to_goals_dir(self, hermes_home): - from hermes_cli.goals import dump_conversation, conversation_dump_path - - msgs = [ - {"role": "user", "content": "hi"}, - {"role": "assistant", "content": "hello"}, - ] - path = dump_conversation("dump-sid-1", msgs) - assert path is not None - assert path.exists() - # Path is under /goals/.json - assert path.parent.name == "goals" - assert path.name == "dump-sid-1.json" - - loaded = json.loads(path.read_text()) - assert loaded == msgs - - # conversation_dump_path returns the same path - assert conversation_dump_path("dump-sid-1") == path - - def test_dump_handles_unsafe_session_id(self, hermes_home): - from hermes_cli.goals import dump_conversation - - path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}]) - assert path is not None - # No traversal — slashes are normalized to underscores. (Periods are - # preserved because they're legitimate in filenames; the resulting - # name still cannot escape /goals/ since path - # separators are gone.) - assert "/" not in path.name - assert path.parent.name == "goals" - # Verify the resolved path stays under the goals dir. - from hermes_cli.goals import _goals_dump_dir - goals_dir = _goals_dump_dir().resolve() - assert str(path.resolve()).startswith(str(goals_dir)) - - def test_dump_skips_when_messages_empty(self, hermes_home): - from hermes_cli.goals import dump_conversation - assert dump_conversation("sid", []) is None - assert dump_conversation("", [{"role": "user", "content": "x"}]) is None - - -# ────────────────────────────────────────────────────────────────────── -# Judge read_file tool: path restriction -# ────────────────────────────────────────────────────────────────────── - - -class TestJudgeReadFile: - def test_restricted_to_allowed_path(self, hermes_home, tmp_path): - from hermes_cli.goals import _judge_read_file - - allowed = tmp_path / "allowed.json" - allowed.write_text("hello\nworld\n") - - ok = _judge_read_file(str(allowed), allowed_path=allowed) - loaded = json.loads(ok) - assert loaded["content"].startswith("hello") - - # Try to read a different file. - sneaky = tmp_path / "secret.txt" - sneaky.write_text("nope\n") - denied = _judge_read_file(str(sneaky), allowed_path=allowed) - loaded = json.loads(denied) - assert "error" in loaded - assert "restricted" in loaded["error"] - - def test_pagination(self, hermes_home, tmp_path): - from hermes_cli.goals import _judge_read_file - f = tmp_path / "big.json" - f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n") - - # offset=10, limit=5 should return lines 10..14. - result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f)) - assert result["returned"] == 5 - assert "line-9" in result["content"] # 1-based: line 10 == zero-indexed 9 - assert result["next_offset"] == 15 - - -# ────────────────────────────────────────────────────────────────────── -# Index conversion: judge emits 1-based, apply layer uses 0-based -# ────────────────────────────────────────────────────────────────────── - - -class TestJudgeIndexConversion: - def test_parse_evaluate_converts_1based_to_0based(self): - """The judge sees the checklist with 1-based indices (rendered as - '1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices. - ``_parse_evaluate_response`` must convert them to 0-based so the - apply layer can index ``state.checklist`` directly. - """ - from hermes_cli.goals import _parse_evaluate_response - - raw = ''' - {"updates": [ - {"index": 1, "status": "completed", "evidence": "first item"}, - {"index": 3, "status": "impossible", "evidence": "third item"} - ], - "new_items": [], - "reason": "evaluated"} - ''' - parsed, parse_failed = _parse_evaluate_response(raw) - assert parse_failed is False - # 1 → 0, 3 → 2 - assert [u["index"] for u in parsed["updates"]] == [0, 2] - assert parsed["updates"][0]["evidence"] == "first item" - assert parsed["updates"][1]["status"] == "impossible" - - def test_full_round_trip_judge_index_to_state(self, hermes_home): - """End-to-end: judge emits 1-based, parser converts, apply layer - flips the right items in state.checklist.""" - from hermes_cli import goals - from hermes_cli.goals import ( - GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED, - ADDED_BY_JUDGE, - ) - - mgr = GoalManager(session_id="idx-round-trip") - mgr.set("g") - mgr.state.decomposed = True - mgr.state.checklist = [ - ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), - ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), - ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), - ] - goals.save_goal("idx-round-trip", mgr.state) - - # Simulate the judge returning a raw-JSON Phase-B reply via the - # auxiliary client: the parser handles the 1-based → 0-based - # conversion so the apply layer flips item 1 (text="first"). - class FakeMessage: - content = ''' - {"updates": [{"index": 1, "status": "completed", "evidence": "first done"}], - "new_items": [], - "reason": "..."} - ''' - tool_calls = None - - class FakeChoice: - message = FakeMessage() - - class FakeResponse: - choices = [FakeChoice()] - - class FakeClient: - class chat: - class completions: - @staticmethod - def create(**kwargs): - return FakeResponse() - - with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")): - mgr.evaluate_after_turn("ran the script and item 1 is done") - - # Item 1 (text="first") should now be completed. - assert mgr.state.checklist[0].text == "first" - assert mgr.state.checklist[0].status == ITEM_COMPLETED - assert mgr.state.checklist[0].evidence == "first done" - # Other items still pending. - assert mgr.state.checklist[1].status == ITEM_PENDING - assert mgr.state.checklist[2].status == ITEM_PENDING - - -# ────────────────────────────────────────────────────────────────────── -# Compression session-rotation: goal must follow the new session_id -# ────────────────────────────────────────────────────────────────────── - - -class TestGoalSurvivesCompressionRotation: - def test_load_goal_after_session_id_rotates(self, hermes_home): - """When auto-compression rotates the session_id, the goal must be - readable from the new session_id (forwarded by run_agent's - _compress_context block). - - We don't run the full _compress_context method here — it has - ~60 dependencies. Instead we mirror exactly what that block does - with state_meta and assert the goal manager picks it up. - """ - from hermes_cli.goals import GoalManager - from hermes_state import SessionDB - - # Create a goal under a parent session_id. - parent_sid = "parent-rotate-001" - mgr = GoalManager(session_id=parent_sid) - mgr.set("survive compression") - assert mgr.is_active() - - # Simulate the run_agent._compress_context forwarding block: - # read goal:, write goal: on the same SessionDB instance. - db = SessionDB() - new_sid = "child-rotate-001" - blob = db.get_meta(f"goal:{parent_sid}") - assert blob, "goal must be in state_meta" - db.set_meta(f"goal:{new_sid}", blob) - - # New GoalManager for the rotated session_id should load the same goal. - mgr2 = GoalManager(session_id=new_sid) - assert mgr2.is_active() - assert mgr2.state.goal == "survive compression" - # Counters/checklist preserved verbatim. - assert mgr2.state.turns_used == mgr.state.turns_used - assert mgr2.state.checklist == mgr.state.checklist - - def test_no_forward_when_no_goal(self, hermes_home): - """Forwarding is a no-op when the parent session has no goal.""" - from hermes_state import SessionDB - from hermes_cli.goals import load_goal - - db = SessionDB() - # Parent has no goal at all. - assert db.get_meta("goal:parent-no-goal") is None - blob = db.get_meta("goal:parent-no-goal") - if blob: # parity with production guard - db.set_meta("goal:child-no-goal", blob) - - # Child should still have no goal. - assert load_goal("child-no-goal") is None - - -# ────────────────────────────────────────────────────────────────────── -# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B) -# ────────────────────────────────────────────────────────────────────── - - -class _FakeFn: - def __init__(self, name, args): - self.name = name - self.arguments = args if isinstance(args, str) else json.dumps(args) - - -class _FakeToolCall: - def __init__(self, tc_id, name, args): - self.id = tc_id - self.type = "function" - self.function = _FakeFn(name, args) - - -class _FakeMessage: - def __init__(self, *, content="", tool_calls=None): - self.content = content - self.tool_calls = tool_calls or [] - - -class _FakeChoice: - def __init__(self, message): - self.message = message - - -class _FakeResponse: - def __init__(self, message): - self.choices = [_FakeChoice(message)] - - -def _make_fake_client(scripted_messages): - """Return a fake client whose .chat.completions.create() returns the - next scripted message each call. Mutates the underlying list as a - queue so repeat calls advance. - """ - class FakeClient: - class chat: - class completions: - _queue = list(scripted_messages) - _calls = [] - - @classmethod - def create(cls, **kwargs): - cls._calls.append(kwargs) - if not cls._queue: - raise RuntimeError("scripted-message queue exhausted") - return _FakeResponse(cls._queue.pop(0)) - - return FakeClient - - -class TestPhaseAToolCall: - def test_decompose_via_submit_checklist_tool(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import decompose_goal - - msg = _FakeMessage( - tool_calls=[_FakeToolCall( - "tc-1", "submit_checklist", - {"items": [{"text": "first criterion"}, {"text": "second criterion"}]}, - )], - ) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - items, err = decompose_goal("build a website") - - assert err is None - assert [it["text"] for it in items] == ["first criterion", "second criterion"] - # Verify we forced the tool: tool_choice should target submit_checklist. - call = client.chat.completions._calls[0] - assert "tools" in call - assert call["tools"][0]["function"]["name"] == "submit_checklist" - # tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}} - # or "required" / "auto" if a fallback was used; primary attempt forces it. - tc = call["tool_choice"] - assert ( - (isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist") - or tc == "required" - or tc == "auto" - ) - - def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home): - """If a broken provider returns content instead of a tool call, the - backstop JSON parser still salvages a checklist.""" - from hermes_cli import goals - from hermes_cli.goals import decompose_goal - - msg = _FakeMessage( - content='{"checklist": [{"text": "salvaged"}]}', - tool_calls=[], - ) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - items, err = decompose_goal("g") - - assert err is None - assert items == [{"text": "salvaged"}] - - def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import decompose_goal - - msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[]) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - items, err = decompose_goal("g") - - assert items == [] - assert err and "submit_checklist" in err - - def test_decompose_drops_empty_text_items(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import decompose_goal - - msg = _FakeMessage( - tool_calls=[_FakeToolCall( - "tc-1", "submit_checklist", - {"items": [{"text": "ok"}, {"text": ""}, {"text": " "}, {"text": "two"}]}, - )], - ) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - items, err = decompose_goal("g") - - assert err is None - assert [it["text"] for it in items] == ["ok", "two"] - - -class TestPhaseBToolCall: - def test_evaluate_via_update_checklist_tool(self, hermes_home): - from hermes_cli import goals - from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING - - state = GoalState( - goal="g", - decomposed=True, - checklist=[ - ChecklistItem(text="a", status=ITEM_PENDING), - ChecklistItem(text="b", status=ITEM_PENDING), - ], - ) - - msg = _FakeMessage( - tool_calls=[_FakeToolCall( - "tc-1", "update_checklist", - { - # 1-based indices; layer converts to 0-based. - "updates": [{"index": 1, "status": "completed", "evidence": "did a"}], - "new_items": [{"text": "discovered c"}], - "reason": "ran a", - }, - )], - ) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - parsed, parse_failed = evaluate_checklist( - state, "did the first thing", history_path=None, - ) - - assert parse_failed is False - # Index converted 1 → 0 - assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}] - assert parsed["new_items"] == [{"text": "discovered c"}] - assert parsed["reason"] == "ran a" - - def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path): - """Phase-B tool loop: judge calls read_file once, then update_checklist.""" - from hermes_cli import goals - from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING - - # Make a real history file so the path-restriction check passes. - hist = tmp_path / "hist.json" - hist.write_text(json.dumps([{"role": "user", "content": "hi"}])) - - state = GoalState( - goal="g", - decomposed=True, - checklist=[ChecklistItem(text="a", status=ITEM_PENDING)], - ) - - msg1 = _FakeMessage(tool_calls=[_FakeToolCall( - "tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100}, - )]) - msg2 = _FakeMessage(tool_calls=[_FakeToolCall( - "tc-2", "update_checklist", - { - "updates": [{"index": 1, "status": "completed", "evidence": "saw it"}], - "new_items": [], - "reason": "verified via read_file", - }, - )]) - client = _make_fake_client([msg1, msg2]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - parsed, parse_failed = evaluate_checklist( - state, "did the thing", history_path=hist, - ) - - assert parse_failed is False - assert parsed["updates"][0]["status"] == "completed" - assert parsed["reason"] == "verified via read_file" - # Two API calls — one for the read, one for the verdict. - assert len(client.chat.completions._calls) == 2 - - def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home): - """update_checklist should only accept 'completed' or 'impossible' — - any 'pending' updates are dropped at the normalize layer.""" - from hermes_cli import goals - from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING - - state = GoalState( - goal="g", - decomposed=True, - checklist=[ - ChecklistItem(text="a", status=ITEM_PENDING), - ChecklistItem(text="b", status=ITEM_PENDING), - ], - ) - msg = _FakeMessage(tool_calls=[_FakeToolCall( - "tc-1", "update_checklist", - { - "updates": [ - {"index": 1, "status": "completed", "evidence": "yes"}, - {"index": 2, "status": "pending", "evidence": "skip me"}, - ], - "new_items": [], - "reason": "...", - }, - )]) - client = _make_fake_client([msg]) - - with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")): - parsed, _pf = evaluate_checklist(state, "x", history_path=None) - - # Only the completed flip survives; pending update is dropped silently. - assert len(parsed["updates"]) == 1 - assert parsed["updates"][0]["index"] == 0 diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 07febffaf98..73b2fc1a811 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -3251,7 +3251,6 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None: decision = goal_mgr.evaluate_after_turn( raw, user_initiated=True, - messages=list(session.get("history") or []), ) verdict_msg = decision.get("message") or "" if verdict_msg: