From 404640a2b752f502825dc8b26212204fa890d495 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 10 May 2026 16:56:51 -0700 Subject: [PATCH] feat(goals): /goal checklist + /subgoal user controls (#23456) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(goals): /goal checklist + /subgoal user controls Two-phase judge for /goal — Phase A decomposes the goal into a detailed checklist on first turn; Phase B evaluates each pending item harshly against the agent's most recent response. The goal completes only when every item is in a terminal status (completed or impossible). Adds /subgoal so the user can append, complete, mark impossible, undo, remove, or clear items the judge missed or got wrong. Mechanics: - GoalState gains `checklist` and `decomposed` fields, both backwards compatible (old state_meta rows load unchanged). - Phase A: aux call writes a harsh, exhaustive checklist; biased toward more items not fewer. Falls through to legacy freeform judge when decompose fails. - Phase B: judge gets the checklist + last-response snippet + path to a per-session conversation dump at /goals/.json. A bounded read_file tool (max 5 calls per turn, restricted to that one file) lets the judge inspect history when the snippet is ambiguous. Stickiness in code: terminal items are frozen, only the user can revert via /subgoal undo. - Continuation prompt shows checklist progress when non-empty; reverts to old prompt when empty. - Status line shows M/N done counts. CLI + gateway + TUI gateway all pass the agent reference into evaluate_after_turn so the dump can be written. Gateway-side /subgoal is allowed mid-run since it only modifies the checklist the judge consults at turn boundaries. Tests: 24 new cases — backcompat round-trip, Phase A decompose, Phase B updates + new_items + stickiness, user override flows, conversation dump (incl. unsafe-sid sanitization), judge read_file restriction. Existing freeform-mode tests updated to patch the renamed `judge_goal_freeform` and skip Phase A explicitly. * fix(goals): off-by-one in judge index, message-list plumbing, prompt tuning Three live-test findings from running /goal end-to-end against gemini-3-flash-preview as the judge: 1. Off-by-one bug — the judge sees the checklist rendered with 1-based indices ('1. [ ] foo, 2. [ ] bar') but the apply layer indexed state.checklist as 0-based. Result: every judge update landed on the wrong item, evidence got attached to neighbouring rows, and the genuine 'first pending' item (usually #1) never got marked. Fix: convert 1 → 0 in _parse_evaluate_response. Also tightened the user prompt to call out the 1-based scheme explicitly. New tests cover the parser conversion + an end-to-end fake-judge round-trip. 2. Conversation dump never happened — _extract_agent_messages tried common AIAgent attribute names (.messages, .conversation_history, etc.) but AIAgent doesn't expose the message list as an instance attribute; it lives inside run_conversation()'s scope. Result: the judge's read_file tool always saw history_path=unavailable. Fix: added an explicit messages= kwarg to evaluate_after_turn that all three call sites (CLI, gateway, TUI gateway) now pass directly. Agent-attribute extraction kept as back-compat fallback. 3. Prompt was too harsh on simple goals. The original 'be HARSH, default to leaving items pending' wording made the judge refuse to mark 'file exists' completed even after the agent ran ls, test -f, os.path.isfile, and find — burning the entire 8-turn budget on a fizzbuzz task. Softened to 'strict but not absurd' with explicit guidance on what counts as evidence and a directive not to require re-proving items already established earlier. Re-tested live with the same fizzbuzz goal: now terminates in 2 turns with all 8 checklist items correctly attributed to their own evidence. /subgoal user-action flow (add / complete / undo / impossible) verified live as well. --- cli.py | 105 ++- gateway/run.py | 101 +- hermes_cli/commands.py | 2 + hermes_cli/goals.py | 1138 +++++++++++++++++++++-- tests/cli/test_cli_goal_interrupt.py | 17 +- tests/gateway/test_goal_verdict_send.py | 14 +- tests/hermes_cli/test_goals.py | 595 +++++++++++- tui_gateway/server.py | 1 + 8 files changed, 1860 insertions(+), 113 deletions(-) diff --git a/cli.py b/cli.py index b3b6a73ffe2..e54acc1c76c 100644 --- a/cli.py +++ b/cli.py @@ -7244,6 +7244,8 @@ class HermesCLI: _cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}") elif canonical == "goal": self._handle_goal_command(cmd_original) + elif canonical == "subgoal": + self._handle_subgoal_command(cmd_original) elif canonical == "skin": self._handle_skin_command(cmd_original) elif canonical == "voice": @@ -7840,6 +7842,103 @@ class HermesCLI: except Exception: pass + def _handle_subgoal_command(self, cmd: str) -> None: + """Dispatch /subgoal subcommands. + + Forms: + /subgoal show the checklist + /subgoal append a user item + /subgoal complete mark item n completed + /subgoal impossible mark item n impossible + /subgoal undo revert item n to pending + /subgoal remove delete item n + /subgoal clear wipe the checklist (judge re-decomposes) + """ + parts = (cmd or "").strip().split(None, 2) + # parts[0] == "/subgoal"; remainder is what the user typed + arg = " ".join(parts[1:]).strip() if len(parts) > 1 else "" + + mgr = self._get_goal_manager() + if mgr is None: + _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}") + return + + if not mgr.has_goal(): + _cprint(f" {_DIM}No active goal. Set one with /goal .{_RST}") + return + + # No args → show the checklist. + if not arg: + _cprint(f" {mgr.status_line()}") + _cprint(f" {mgr.render_checklist()}") + return + + tokens = arg.split(None, 1) + verb = tokens[0].lower() + rest = tokens[1].strip() if len(tokens) > 1 else "" + + # Action verbs operate on indices. + action_status_map = { + "complete": "completed", + "completed": "completed", + "done": "completed", + "impossible": "impossible", + "imp": "impossible", + "skip": "impossible", + "undo": "pending", + "pending": "pending", + "reset": "pending", + } + if verb in action_status_map: + if not rest: + _cprint(f" Usage: /subgoal {verb} ") + return + try: + idx = int(rest.split()[0]) + except ValueError: + _cprint(f" /subgoal {verb}: must be an integer (1-based index).") + return + try: + item = mgr.mark_subgoal(idx, action_status_map[verb]) + except (IndexError, ValueError, RuntimeError) as exc: + _cprint(f" /subgoal {verb}: {exc}") + return + _cprint(f" ✓ Item {idx} → {item.status}: {item.text}") + return + + if verb == "remove": + if not rest: + _cprint(" Usage: /subgoal remove ") + return + try: + idx = int(rest.split()[0]) + except ValueError: + _cprint(" /subgoal remove: must be an integer (1-based index).") + return + try: + removed = mgr.remove_subgoal(idx) + except (IndexError, RuntimeError) as exc: + _cprint(f" /subgoal remove: {exc}") + return + _cprint(f" ✓ Removed item {idx}: {removed.text}") + return + + if verb == "clear": + mgr.clear_checklist() + _cprint( + " ✓ Checklist cleared. The judge will re-decompose on the next turn." + ) + return + + # Otherwise: append `arg` as a user-authored checklist item. + try: + item = mgr.add_subgoal(arg) + except (ValueError, RuntimeError) as exc: + _cprint(f" /subgoal: {exc}") + return + idx = len(mgr.state.checklist) if mgr.state else 0 + _cprint(f" ✓ Added subgoal {idx}: {item.text}") + def _maybe_continue_goal_after_turn(self) -> None: """Hook run after every CLI turn. Judges + maybe re-queues. @@ -7917,7 +8016,11 @@ class HermesCLI: if not last_response.strip(): return - decision = mgr.evaluate_after_turn(last_response, user_initiated=True) + decision = mgr.evaluate_after_turn( + last_response, + user_initiated=True, + messages=getattr(self, "conversation_history", None) or [], + ) msg = decision.get("message") or "" if msg: _cprint(f" {msg}") diff --git a/gateway/run.py b/gateway/run.py index 129827cbfb6..358e3702489 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -6063,6 +6063,12 @@ class GatewayRunner: return await self._handle_goal_command(event) return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal." + # /subgoal is safe mid-run — it only modifies the active goal's + # checklist, which the judge consults at turn boundaries. There + # is no race with the running turn. + if _cmd_def_inner and _cmd_def_inner.name == "subgoal": + return await self._handle_subgoal_command(event) + # Session-level toggles that are safe to run mid-agent — # /yolo can unblock a pending approval prompt, /verbose cycles # the tool-progress display mode for the ongoing stream. @@ -6442,6 +6448,9 @@ class GatewayRunner: if canonical == "goal": return await self._handle_goal_command(event) + if canonical == "subgoal": + return await self._handle_subgoal_command(event) + if canonical == "voice": return await self._handle_voice_command(event) @@ -6616,10 +6625,18 @@ class GatewayRunner: except Exception: session_entry = None if session_entry is not None: + # Pull the agent's full messages list from the result + # so the judge can dump it for its read_file tool. + _agent_messages: list = [] + if isinstance(_agent_result, dict): + _msgs = _agent_result.get("messages") + if isinstance(_msgs, list): + _agent_messages = _msgs await self._post_turn_goal_continuation( session_entry=session_entry, source=source, final_response=_final_text, + agent_messages=_agent_messages, ) except Exception as _goal_exc: logger.debug("goal continuation hook failed: %s", _goal_exc) @@ -9335,6 +9352,83 @@ class GatewayRunner: return t("gateway.goal.set", budget=state.max_turns, goal=state.goal) + async def _handle_subgoal_command(self, event: "MessageEvent") -> str: + """Handle /subgoal for gateway platforms. + + Forms (mirror of CLI): + /subgoal show the checklist + /subgoal append a user item + /subgoal complete | done mark item n completed + /subgoal impossible mark item n impossible + /subgoal undo revert item n to pending + /subgoal remove delete item n + /subgoal clear wipe the checklist + """ + args = (event.get_command_args() or "").strip() + + mgr, _session_entry = self._get_goal_manager_for_event(event) + if mgr is None: + return t("gateway.goal.unavailable") + + if not mgr.has_goal(): + return "No active goal. Set one with /goal ." + + if not args: + return f"{mgr.status_line()}\n{mgr.render_checklist()}" + + tokens = args.split(None, 1) + verb = tokens[0].lower() + rest = tokens[1].strip() if len(tokens) > 1 else "" + + action_status_map = { + "complete": "completed", + "completed": "completed", + "done": "completed", + "impossible": "impossible", + "imp": "impossible", + "skip": "impossible", + "undo": "pending", + "pending": "pending", + "reset": "pending", + } + if verb in action_status_map: + if not rest: + return f"Usage: /subgoal {verb} " + try: + idx = int(rest.split()[0]) + except ValueError: + return f"/subgoal {verb}: must be an integer (1-based index)." + try: + item = mgr.mark_subgoal(idx, action_status_map[verb]) + except (IndexError, ValueError, RuntimeError) as exc: + return f"/subgoal {verb}: {exc}" + return f"✓ Item {idx} → {item.status}: {item.text}" + + if verb == "remove": + if not rest: + return "Usage: /subgoal remove " + try: + idx = int(rest.split()[0]) + except ValueError: + return "/subgoal remove: must be an integer (1-based index)." + try: + removed = mgr.remove_subgoal(idx) + except (IndexError, RuntimeError) as exc: + return f"/subgoal remove: {exc}" + return f"✓ Removed item {idx}: {removed.text}" + + if verb == "clear": + mgr.clear_checklist() + return "✓ Checklist cleared. The judge will re-decompose on the next turn." + + # Otherwise — append `args` as a new user-authored checklist item. + try: + item = mgr.add_subgoal(args) + except (ValueError, RuntimeError) as exc: + return f"/subgoal: {exc}" + idx = len(mgr.state.checklist) if mgr.state else 0 + return f"✓ Added subgoal {idx}: {item.text}" + async def _send_goal_status_notice(self, source: Any, message: str) -> None: """Send a /goal judge status line back to the originating chat/thread.""" adapter = self.adapters.get(source.platform) @@ -9403,6 +9497,7 @@ class GatewayRunner: session_entry: Any, source: Any, final_response: str, + agent_messages: Optional[list] = None, ) -> None: """Run the goal judge after a gateway turn and, if still active, enqueue a continuation prompt for the same session. @@ -9430,7 +9525,11 @@ class GatewayRunner: if not mgr.is_active(): return - decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True) + decision = mgr.evaluate_after_turn( + final_response or "", + user_initiated=True, + messages=agent_messages or [], + ) msg = decision.get("message") or "" # Defer the status line until after the adapter has delivered the diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 3f543bba647..e2727c25bab 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -104,6 +104,8 @@ COMMAND_REGISTRY: list[CommandDef] = [ args_hint=""), CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session", args_hint="[text | pause | resume | clear | status]"), + CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session", + args_hint="[text | complete N | impossible N | undo N | remove N | clear]"), CommandDef("status", "Show session info", "Session"), CommandDef("whoami", "Show your slash command access (admin / user)", "Info"), CommandDef("profile", "Show active profile name and home directory", "Info"), diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index 894cdddb01b..4f54e74f66f 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -7,6 +7,18 @@ continuation prompt back into the same session and keeps working until the goal is done, turn budget is exhausted, the user pauses/clears it, or the user sends a new message (which takes priority and pauses the goal loop). +Checklist mode (added 2026-05): when a goal is set, a Phase-A "decompose" +call asks the judge to write an extremely detailed checklist of concrete +completion criteria for that goal. On every subsequent turn (Phase B) the +judge evaluates the agent's most recent output against EACH pending item +and may flip pending → completed | impossible, or append new items it +discovers along the way. The goal is done only when every checklist item +is in a terminal status. This is much harsher than the freeform +"is the goal done?" prompt and gives users a visible, verifiable progress +surface via /subgoal. A bounded read_file tool loop lets the judge inspect +the dumped conversation history when the snippet alone isn't enough to +rule. + State is persisted in SessionDB's ``state_meta`` table keyed by ``goal:`` so ``/resume`` picks it up. @@ -21,6 +33,9 @@ Design notes / invariants: prompt and also pauses the goal loop for that turn (we still re-judge after, so if the user's message happens to complete the goal the judge will say ``done``). +- Stickiness: once an item is marked completed or impossible, only the user + (via /subgoal undo) can flip it back. Judge updates that try to regress + terminal items are silently ignored. - This module has zero hard dependency on ``cli.HermesCLI`` or the gateway runner — both wire the same ``GoalManager`` in. @@ -31,10 +46,12 @@ from __future__ import annotations import json import logging +import os import re import time -from dataclasses import dataclass, asdict -from typing import Any, Dict, Optional, Tuple +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) @@ -44,8 +61,9 @@ logger = logging.getLogger(__name__) # ────────────────────────────────────────────────────────────────────── DEFAULT_MAX_TURNS = 20 -DEFAULT_JUDGE_TIMEOUT = 30.0 -# Cap how much of the last response + recent messages we send to the judge. +DEFAULT_JUDGE_TIMEOUT = 60.0 +# Cap how much of the last response we send to the judge inline. The judge +# can read the dumped conversation file via read_file if it needs more. _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # After this many consecutive judge *parse* failures (empty output / non-JSON), # the loop auto-pauses and points the user at the goal_judge config. API / @@ -55,8 +73,36 @@ _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # exhausted with every reply shaped like `judge returned empty response` or # `judge reply was not JSON`. DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3 +# Bound the Phase-B judge tool loop: if the judge keeps calling read_file +# without ever emitting a verdict, cap it so we don't burn the model's budget. +DEFAULT_MAX_JUDGE_TOOL_CALLS = 5 +# Cap a single read_file response so a judge that tries to read 100k lines +# doesn't blow up its own context. Judge can paginate if needed. +_JUDGE_READ_FILE_MAX_LINES = 400 +_JUDGE_READ_FILE_MAX_CHARS = 32_000 +# Status constants ──────────────────────────────────────────────────── +ITEM_PENDING = "pending" +ITEM_COMPLETED = "completed" +ITEM_IMPOSSIBLE = "impossible" +TERMINAL_ITEM_STATUSES = frozenset({ITEM_COMPLETED, ITEM_IMPOSSIBLE}) +VALID_ITEM_STATUSES = frozenset({ITEM_PENDING, ITEM_COMPLETED, ITEM_IMPOSSIBLE}) + +ITEM_MARKERS = { + ITEM_COMPLETED: "[x]", + ITEM_IMPOSSIBLE: "[!]", + ITEM_PENDING: "[ ]", +} + +ADDED_BY_JUDGE = "judge" +ADDED_BY_USER = "user" + + +# ────────────────────────────────────────────────────────────────────── +# Continuation prompt +# ────────────────────────────────────────────────────────────────────── + CONTINUATION_PROMPT_TEMPLATE = ( "[Continuing toward your standing goal]\n" "Goal: {goal}\n\n" @@ -65,8 +111,56 @@ CONTINUATION_PROMPT_TEMPLATE = ( "If you are blocked and need input from the user, say so clearly and stop." ) +CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE = ( + "[Continuing toward your standing goal]\n" + "Goal: {goal}\n\n" + "Checklist progress ({done}/{total} done):\n" + "{checklist}\n\n" + "Work on the unchecked items above. Do not declare items done yourself " + "— a judge marks them based on evidence in your output. If an item is " + "genuinely impossible in this environment, explain why so the judge can " + "mark it impossible. If you are blocked on a remaining item and need " + "user input, say so clearly and stop." +) -JUDGE_SYSTEM_PROMPT = ( + +# ────────────────────────────────────────────────────────────────────── +# Phase-A: decompose prompts +# ────────────────────────────────────────────────────────────────────── + +DECOMPOSE_SYSTEM_PROMPT = ( + "You are a strict judge for an autonomous agent. Your first job, before " + "judging anything, is to break the user's stated goal into an EXTREMELY " + "DETAILED checklist of concrete, verifiable completion criteria. Each " + "item must be specific enough that a third party reading the agent's " + "output could decide unambiguously whether that item was achieved.\n\n" + "Be exhaustive. Bias toward MORE items, not fewer. Include sub-items, " + "edge cases, quality bars, deployment steps, verification checks, and " + "anything the user would reasonably expect from a goal of this type. " + "If the user said 'build me a website' you should be enumerating " + "homepage exists, navigation links work, content is non-placeholder, " + "mobile responsive, accessibility tags present, deployed somewhere " + "publicly accessible, domain/URL is functional, etc. Better to " + "over-specify and let a few items get marked impossible than to " + "under-specify and let the agent declare victory early.\n\n" + "Reply ONLY with a single JSON object on one line:\n" + '{"checklist": [{"text": ""}, {"text": ""}, ...]}' +) + +DECOMPOSE_USER_PROMPT_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Produce the harshest, most detailed checklist of completion criteria " + "you can. Aim for at least 5 items; more is better when warranted. " + "Each item should be a single verifiable statement of fact about the " + "finished work." +) + + +# ────────────────────────────────────────────────────────────────────── +# Phase-B: evaluate prompts +# ────────────────────────────────────────────────────────────────────── + +EVALUATE_SYSTEM_PROMPT_FREEFORM = ( "You are a strict judge evaluating whether an autonomous agent has " "achieved a user's stated goal. You receive the goal text and the " "agent's most recent response. Your only job is to decide whether " @@ -78,11 +172,60 @@ JUDGE_SYSTEM_PROMPT = ( "user input (treat this as DONE with reason describing the block).\n\n" "Otherwise the goal is NOT done — CONTINUE.\n\n" "Reply ONLY with a single JSON object on one line:\n" - '{\"done\": , \"reason\": \"\"}' + '{"done": , "reason": ""}' ) +EVALUATE_SYSTEM_PROMPT_CHECKLIST = ( + "You are a strict judge evaluating an autonomous agent's progress on " + "a user's goal that has a detailed checklist of completion criteria. " + "For EACH currently-pending checklist item, decide whether the " + "available evidence shows the item is satisfied.\n\n" + "Be strict but not absurd. Default to leaving items pending UNLESS " + "evidence is reasonably clear. Reasonable evidence includes:\n" + "- The agent's most recent response describing or showing the work\n" + "- Tool call results visible in the conversation history (file writes, " + "command output, web requests, etc.)\n" + "- A clear statement by the agent that the work was done, when " + "supported by tool output earlier in the conversation\n\n" + "Do NOT require the agent to re-prove items it has already established " + "in earlier turns. If a tool call earlier in the conversation already " + "wrote a file, you do not need fresh `ls` output every turn — once " + "established, it's done.\n\n" + "Flip pending → completed when the response or recent tool calls show " + "the item is satisfied. Flip pending → impossible only when the work " + "demonstrates the item cannot be achieved in this environment (NOT " + "merely that the agent didn't try). Vague intentions ('I will do X " + "next') do NOT count as completion.\n\n" + "You may APPEND new checklist items if the agent's work reveals " + "criteria the original decomposition missed. Stay strict — only add " + "items that genuinely belong as completion criteria.\n\n" + "STICKINESS: items already marked completed or impossible are frozen. " + "Do not include them in your updates. Only the user can revert them.\n\n" + "TOOLS: you have read_file(path, offset, limit) available. The full " + "conversation history for this session is dumped to a JSON file whose " + "path is given in the user message. Call read_file on it when the " + "snippet is ambiguous, when you need to verify a tool call actually " + "happened, or when you want to see what skills the agent loaded. " + "Otherwise, judge from the snippet directly — extra reads cost tokens.\n\n" + "When you are ready to rule, reply ONLY with a single JSON object:\n" + '{"updates": [{"index": , "status": "completed|impossible", "evidence": ""}, ...], ' + '"new_items": [{"text": ""}, ...], ' + '"reason": ""}\n' + "When citing evidence, reference the agent's actual output specifically. " + "Empty updates is fine. Empty new_items is fine. The reason field is required." +) -JUDGE_USER_PROMPT_TEMPLATE = ( +EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Current checklist (each item is numbered, 1-based — use these " + "exact 1-based numbers as the ``index`` field in your updates):\n{checklist_block}\n\n" + "Agent's most recent response (snippet):\n{response}\n\n" + "Conversation history file (call read_file on this path if you need " + "more context — pagination supported via offset/limit):\n{history_path}\n\n" + "Evaluate each pending item. Cite specific evidence." +) + +EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = ( "Goal:\n{goal}\n\n" "Agent's most recent response:\n{response}\n\n" "Is the goal satisfied?" @@ -90,16 +233,55 @@ JUDGE_USER_PROMPT_TEMPLATE = ( # ────────────────────────────────────────────────────────────────────── -# Dataclass +# Dataclasses # ────────────────────────────────────────────────────────────────────── +@dataclass +class ChecklistItem: + """One concrete completion criterion attached to a goal.""" + + text: str + status: str = ITEM_PENDING # pending | completed | impossible + added_by: str = ADDED_BY_JUDGE # judge | user + added_at: float = 0.0 + completed_at: Optional[float] = None + evidence: Optional[str] = None # judge's rationale on flip + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ChecklistItem": + text = str(data.get("text", "")).strip() + if not text: + text = "(empty item)" + status = str(data.get("status", ITEM_PENDING)).strip().lower() + if status not in VALID_ITEM_STATUSES: + status = ITEM_PENDING + added_by = str(data.get("added_by", ADDED_BY_JUDGE)).strip().lower() + if added_by not in (ADDED_BY_JUDGE, ADDED_BY_USER): + added_by = ADDED_BY_JUDGE + return cls( + text=text, + status=status, + added_by=added_by, + added_at=float(data.get("added_at", 0.0) or 0.0), + completed_at=( + float(data["completed_at"]) + if data.get("completed_at") is not None + else None + ), + evidence=data.get("evidence"), + ) + + @dataclass class GoalState: """Serializable goal state stored per session.""" goal: str - status: str = "active" # active | paused | done | cleared + status: str = "active" # active | paused | done | cleared turns_used: int = 0 max_turns: int = DEFAULT_MAX_TURNS created_at: float = 0.0 @@ -108,13 +290,28 @@ class GoalState: last_reason: Optional[str] = None paused_reason: Optional[str] = None # why we auto-paused (budget, etc.) consecutive_parse_failures: int = 0 # judge-output parse failures in a row + # Checklist mode (added 2026-05). Both fields default safely so old + # state_meta rows load unchanged. + checklist: List[ChecklistItem] = field(default_factory=list) + decomposed: bool = False # has Phase-A run for this goal? def to_json(self) -> str: - return json.dumps(asdict(self), ensure_ascii=False) + data = asdict(self) + # asdict already serializes ChecklistItem via dataclass recursion. + return json.dumps(data, ensure_ascii=False) @classmethod def from_json(cls, raw: str) -> "GoalState": data = json.loads(raw) + raw_checklist = data.get("checklist") or [] + checklist: List[ChecklistItem] = [] + if isinstance(raw_checklist, list): + for item in raw_checklist: + if isinstance(item, dict): + try: + checklist.append(ChecklistItem.from_dict(item)) + except Exception: + continue return cls( goal=data.get("goal", ""), status=data.get("status", "active"), @@ -126,8 +323,39 @@ class GoalState: last_reason=data.get("last_reason"), paused_reason=data.get("paused_reason"), consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0), + checklist=checklist, + decomposed=bool(data.get("decomposed", False)), ) + # --- checklist helpers ------------------------------------------------ + + def checklist_counts(self) -> Tuple[int, int, int, int]: + """Return (total, completed, impossible, pending).""" + total = len(self.checklist) + completed = sum(1 for it in self.checklist if it.status == ITEM_COMPLETED) + impossible = sum(1 for it in self.checklist if it.status == ITEM_IMPOSSIBLE) + pending = total - completed - impossible + return total, completed, impossible, pending + + def all_terminal(self) -> bool: + """True iff at least one item exists and every item is in a terminal status.""" + if not self.checklist: + return False + return all(it.status in TERMINAL_ITEM_STATUSES for it in self.checklist) + + def render_checklist(self, *, numbered: bool = False) -> str: + if not self.checklist: + return "(empty)" + lines = [] + for i, item in enumerate(self.checklist, start=1): + marker = ITEM_MARKERS.get(item.status, "[?]") + prefix = f"{i}. {marker}" if numbered else f" {marker}" + line = f"{prefix} {item.text}" + if item.status == ITEM_IMPOSSIBLE and item.evidence: + line += f" (impossible: {item.evidence})" + lines.append(line) + return "\n".join(lines) + # ────────────────────────────────────────────────────────────────────── # Persistence (SessionDB state_meta) @@ -215,7 +443,63 @@ def clear_goal(session_id: str) -> None: # ────────────────────────────────────────────────────────────────────── -# Judge +# Conversation-history dump (read by the judge tool loop) +# ────────────────────────────────────────────────────────────────────── + + +def _goals_dump_dir() -> Optional[Path]: + """Return ``/goals`` (created on first use), or None on error.""" + try: + from hermes_constants import get_hermes_home + + home = Path(get_hermes_home()) + except Exception as exc: + logger.debug("goals dump dir: get_hermes_home failed: %s", exc) + return None + try: + path = home / "goals" + path.mkdir(parents=True, exist_ok=True) + return path + except Exception as exc: + logger.debug("goals dump dir: mkdir failed: %s", exc) + return None + + +def _safe_session_filename(session_id: str) -> str: + """Make a session_id safe for use as a filename component.""" + cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", session_id or "unknown") + # Bound length to keep filesystem happy. + return cleaned[:128] or "unknown" + + +def conversation_dump_path(session_id: str) -> Optional[Path]: + """Where the dumped messages JSON for ``session_id`` lives.""" + base = _goals_dump_dir() + if base is None: + return None + return base / f"{_safe_session_filename(session_id)}.json" + + +def dump_conversation(session_id: str, messages: List[Dict[str, Any]]) -> Optional[Path]: + """Write ``messages`` to the goals/ dump file. Returns the path on success.""" + if not session_id or not messages: + return None + path = conversation_dump_path(session_id) + if path is None: + return None + try: + # Best-effort: messages may contain non-JSON-serializable objects from + # provider-specific adapter shims. Fall through with default=str. + with open(path, "w", encoding="utf-8") as fh: + json.dump(messages, fh, ensure_ascii=False, indent=2, default=str) + return path + except Exception as exc: + logger.debug("dump_conversation: write failed: %s", exc) + return None + + +# ────────────────────────────────────────────────────────────────────── +# Judge: parsing helpers # ────────────────────────────────────────────────────────────────────── @@ -227,11 +511,34 @@ def _truncate(text: str, limit: int) -> str: return text[:limit] + "… [truncated]" -_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL) +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]: + """Best-effort extraction of a single JSON object from a possibly-prosey reply.""" + if not raw: + return None + text = raw.strip() + if text.startswith("```"): + text = text.strip("`") + nl = text.find("\n") + if nl != -1: + text = text[nl + 1:] + try: + data = json.loads(text) + except Exception: + match = _JSON_OBJECT_RE.search(text) + if not match: + return None + try: + data = json.loads(match.group(0)) + except Exception: + return None + return data if isinstance(data, dict) else None def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: - """Parse the judge's reply. Fail-open to ``(False, "", parse_failed)``. + """Parse the freeform judge's reply. Fail-open to ``(False, "", parse_failed)``. Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the judge returned output that couldn't be interpreted as the expected JSON @@ -242,30 +549,8 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: if not raw: return False, "judge returned empty response", True - text = raw.strip() - - # Strip markdown code fences the model may wrap JSON in. - if text.startswith("```"): - text = text.strip("`") - # Peel off leading json/JSON/etc tag - nl = text.find("\n") - if nl != -1: - text = text[nl + 1:] - - # First try: parse the whole blob. - data: Optional[Dict[str, Any]] = None - try: - data = json.loads(text) - except Exception: - # Second try: pull the first JSON object out. - match = _JSON_OBJECT_RE.search(text) - if match: - try: - data = json.loads(match.group(0)) - except Exception: - data = None - - if not isinstance(data, dict): + data = _extract_json_object(raw) + if data is None: return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True done_val = data.get("done") @@ -279,49 +564,286 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: return done, reason, False -def judge_goal( +def _parse_decompose_response(raw: str) -> Tuple[List[Dict[str, Any]], bool]: + """Parse a Phase-A decompose reply. Returns (items, parse_failed).""" + if not raw: + return [], True + data = _extract_json_object(raw) + if data is None: + return [], True + raw_items = data.get("checklist") + if not isinstance(raw_items, list): + return [], True + out: List[Dict[str, Any]] = [] + for item in raw_items: + if isinstance(item, dict): + text = str(item.get("text", "")).strip() + if text: + out.append({"text": text}) + elif isinstance(item, str): + text = item.strip() + if text: + out.append({"text": text}) + return out, False + + +def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]: + """Parse a Phase-B checklist eval reply. Returns (parsed, parse_failed). + + parsed = {"updates": [...], "new_items": [...], "reason": str} + """ + if not raw: + return {"updates": [], "new_items": [], "reason": "judge returned empty response"}, True + data = _extract_json_object(raw) + if data is None: + return ( + { + "updates": [], + "new_items": [], + "reason": f"judge reply was not JSON: {_truncate(raw, 200)!r}", + }, + True, + ) + updates = data.get("updates") or [] + new_items = data.get("new_items") or [] + reason = str(data.get("reason") or "").strip() or "no reason provided" + norm_updates = [] + if isinstance(updates, list): + for upd in updates: + if not isinstance(upd, dict): + continue + try: + # Judge sees the checklist rendered with 1-based indices + # (matches the /subgoal CLI). Convert to 0-based here so the + # apply layer can index ``state.checklist`` directly. + idx_1based = int(upd.get("index")) + except (TypeError, ValueError): + continue + idx = idx_1based - 1 + status = str(upd.get("status", "")).strip().lower() + if status not in TERMINAL_ITEM_STATUSES: + # Phase-B only accepts terminal flips. Pending → pending is a no-op. + continue + evidence = str(upd.get("evidence") or "").strip() or None + norm_updates.append({"index": idx, "status": status, "evidence": evidence}) + norm_new = [] + if isinstance(new_items, list): + for it in new_items: + if isinstance(it, dict): + text = str(it.get("text", "")).strip() + if text: + norm_new.append({"text": text}) + elif isinstance(it, str): + text = it.strip() + if text: + norm_new.append({"text": text}) + return {"updates": norm_updates, "new_items": norm_new, "reason": reason}, False + + +# ────────────────────────────────────────────────────────────────────── +# Judge: read_file tool for the judge's bounded tool loop +# ────────────────────────────────────────────────────────────────────── + + +_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = { + "type": "function", + "function": { + "name": "read_file", + "description": ( + "Read a portion of the dumped conversation history JSON file. " + "Use this when the snippet alone isn't enough to rule. Returns " + "lines from the file with 1-based line numbers. Pagination " + "supported via offset and limit. Reads beyond a built-in cap " + "are truncated." + ), + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": ( + "Absolute path to the conversation history file. " + "You were given this in the user message." + ), + }, + "offset": { + "type": "integer", + "description": "1-indexed starting line number (default 1).", + "default": 1, + }, + "limit": { + "type": "integer", + "description": ( + f"Max lines to return (default {_JUDGE_READ_FILE_MAX_LINES})." + ), + "default": _JUDGE_READ_FILE_MAX_LINES, + }, + }, + "required": ["path"], + }, + }, +} + + +def _judge_read_file( + path: str, + *, + offset: int = 1, + limit: int = _JUDGE_READ_FILE_MAX_LINES, + allowed_path: Optional[Path] = None, +) -> str: + """Bounded read of the dumped conversation file. Returns JSON-serializable text. + + Restricted to ``allowed_path`` when provided — the judge cannot use this + tool to read arbitrary files. + """ + if not path: + return json.dumps({"error": "path is required"}) + try: + target = Path(path).resolve() + except Exception as exc: + return json.dumps({"error": f"path resolve failed: {exc}"}) + + if allowed_path is not None: + try: + allowed = allowed_path.resolve() + except Exception: + allowed = allowed_path + if target != allowed: + return json.dumps({ + "error": ( + f"read_file is restricted to the conversation dump path. " + f"Allowed: {allowed}" + ) + }) + + if not target.exists(): + return json.dumps({"error": f"file not found: {target}"}) + try: + offset = max(1, int(offset or 1)) + limit = max(1, min(int(limit or _JUDGE_READ_FILE_MAX_LINES), _JUDGE_READ_FILE_MAX_LINES)) + except (TypeError, ValueError): + return json.dumps({"error": "offset and limit must be integers"}) + + try: + with open(target, "r", encoding="utf-8", errors="replace") as fh: + lines = fh.readlines() + except Exception as exc: + return json.dumps({"error": f"read failed: {exc}"}) + + total = len(lines) + start = offset - 1 + end = min(start + limit, total) + slice_lines = lines[start:end] + out = "".join(slice_lines) + if len(out) > _JUDGE_READ_FILE_MAX_CHARS: + out = out[:_JUDGE_READ_FILE_MAX_CHARS] + "\n… [truncated by judge read cap]" + return json.dumps({ + "path": str(target), + "total_lines": total, + "offset": offset, + "returned": len(slice_lines), + "next_offset": end + 1 if end < total else None, + "content": out, + }, ensure_ascii=False) + + +# ────────────────────────────────────────────────────────────────────── +# Judge: phase-A (decompose) and phase-B (evaluate) +# ────────────────────────────────────────────────────────────────────── + + +def _get_judge_client() -> Tuple[Optional[Any], str]: + """Return (client, model) or (None, '') when unavailable.""" + try: + from agent.auxiliary_client import get_text_auxiliary_client + except Exception as exc: + logger.debug("goal judge: auxiliary client import failed: %s", exc) + return None, "" + try: + client, model = get_text_auxiliary_client("goal_judge") + except Exception as exc: + logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) + return None, "" + if client is None or not model: + return None, "" + return client, model + + +def decompose_goal( + goal: str, + *, + timeout: float = DEFAULT_JUDGE_TIMEOUT, +) -> Tuple[List[Dict[str, Any]], Optional[str]]: + """Phase-A: ask the judge to break the goal into a checklist. + + Returns ``(items, error)``. On any failure, returns ``([], reason)`` so + the caller can decide whether to fall back to freeform mode. + """ + if not goal.strip(): + return [], "empty goal" + + client, model = _get_judge_client() + if client is None: + return [], "auxiliary client unavailable" + + try: + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT}, + { + "role": "user", + "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format( + goal=_truncate(goal, 4000) + ), + }, + ], + temperature=0, + max_tokens=2000, + timeout=timeout, + ) + except Exception as exc: + logger.info("goal decompose: API call failed (%s)", exc) + return [], f"decompose error: {type(exc).__name__}" + + try: + raw = resp.choices[0].message.content or "" + except Exception: + raw = "" + + items, parse_failed = _parse_decompose_response(raw) + if parse_failed or not items: + logger.info( + "goal decompose: parse failed or empty checklist (raw=%r)", + _truncate(raw, 200), + ) + return [], "decompose parse failed or empty" + logger.info("goal decompose: produced %d checklist items", len(items)) + return items, None + + +def judge_goal_freeform( goal: str, last_response: str, *, timeout: float = DEFAULT_JUDGE_TIMEOUT, ) -> Tuple[str, str, bool]: - """Ask the auxiliary model whether the goal is satisfied. + """Legacy freeform judge — kept for goals with no checklist. Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``, - ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached). - - ``parse_failed`` is True only when the judge call succeeded but its output - was unusable (empty or non-JSON). API/transport errors return False — they - are transient and should fail-open silently. Callers use this flag to - auto-pause after N consecutive parse failures (see - ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``). - - This is deliberately fail-open: any error returns ``("continue", "...", False)`` - so a broken judge doesn't wedge progress — the turn budget and the - consecutive-parse-failures auto-pause are the backstops. + ``"continue"``, or ``"skipped"``. """ if not goal.strip(): return "skipped", "empty goal", False if not last_response.strip(): - # No substantive reply this turn — almost certainly not done yet. return "continue", "empty response (nothing to evaluate)", False - try: - from agent.auxiliary_client import get_text_auxiliary_client - except Exception as exc: - logger.debug("goal judge: auxiliary client import failed: %s", exc) + client, model = _get_judge_client() + if client is None: return "continue", "auxiliary client unavailable", False - try: - client, model = get_text_auxiliary_client("goal_judge") - except Exception as exc: - logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) - return "continue", "auxiliary client unavailable", False - - if client is None or not model: - return "continue", "no auxiliary client configured", False - - prompt = JUDGE_USER_PROMPT_TEMPLATE.format( + prompt = EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE.format( goal=_truncate(goal, 2000), response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), ) @@ -330,7 +852,7 @@ def judge_goal( resp = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": JUDGE_SYSTEM_PROMPT}, + {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_FREEFORM}, {"role": "user", "content": prompt}, ], temperature=0, @@ -348,10 +870,178 @@ def judge_goal( done, reason, parse_failed = _parse_judge_response(raw) verdict = "done" if done else "continue" - logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120)) + logger.info("goal judge (freeform): verdict=%s reason=%s", verdict, _truncate(reason, 120)) return verdict, reason, parse_failed +def evaluate_checklist( + state: GoalState, + last_response: str, + *, + history_path: Optional[Path], + timeout: float = DEFAULT_JUDGE_TIMEOUT, + max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS, +) -> Tuple[Dict[str, Any], bool]: + """Phase-B: judge evaluates each pending checklist item. + + Runs a bounded tool loop so the judge can call ``read_file`` on the + dumped conversation history when the snippet isn't enough. + + Returns ``(parsed, parse_failed)`` where parsed is + ``{"updates": [...], "new_items": [...], "reason": str}``. + Falls open on transport errors: empty updates/new_items, parse_failed=False. + """ + client, model = _get_judge_client() + if client is None: + return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False) + + # Render checklist with 1-based indices the judge can address. + checklist_block = state.render_checklist(numbered=True) + + user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format( + goal=_truncate(state.goal, 2000), + checklist_block=checklist_block, + response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), + history_path=str(history_path) if history_path else "(unavailable — judge from snippet only)", + ) + + messages: List[Dict[str, Any]] = [ + {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_CHECKLIST}, + {"role": "user", "content": user_prompt}, + ] + + # Some auxiliary providers may not support tool calls. We pass tools + # optimistically; if the provider returns a verdict directly without + # using them, we just parse it. + tools = [_JUDGE_READ_FILE_TOOL_SCHEMA] if history_path is not None else None + + tool_calls_left = max(0, int(max_tool_calls)) + final_raw = "" + + for _ in range(tool_calls_left + 1): + try: + kwargs: Dict[str, Any] = { + "model": model, + "messages": messages, + "temperature": 0, + "max_tokens": 1500, + "timeout": timeout, + } + if tools: + kwargs["tools"] = tools + kwargs["tool_choice"] = "auto" + resp = client.chat.completions.create(**kwargs) + except Exception as exc: + logger.info("goal judge (checklist): API call failed (%s)", exc) + return ( + { + "updates": [], + "new_items": [], + "reason": f"judge error: {type(exc).__name__}", + }, + False, + ) + + try: + choice = resp.choices[0] + msg = choice.message + except Exception: + return ( + {"updates": [], "new_items": [], "reason": "judge response malformed"}, + True, + ) + + # Unpack tool_calls in a way that works for openai-py and other shims. + tool_calls = getattr(msg, "tool_calls", None) or [] + content = getattr(msg, "content", "") or "" + + if not tool_calls: + final_raw = content + break + + if tool_calls_left <= 0: + # Out of budget. Force a final ruling on the next pass by + # appending a system note and disabling tools. + messages.append({ + "role": "user", + "content": ( + "You have exhausted your read_file budget. Issue your " + "final JSON verdict now without calling more tools." + ), + }) + tools = None + continue + + # Append the assistant turn, then handle each tool call. + assistant_record: Dict[str, Any] = { + "role": "assistant", + "content": content, + "tool_calls": [], + } + for tc in tool_calls: + try: + tc_id = getattr(tc, "id", None) or "tc-?" + fn = getattr(tc, "function", None) + fn_name = getattr(fn, "name", "") if fn is not None else "" + fn_args = getattr(fn, "arguments", "") if fn is not None else "" + assistant_record["tool_calls"].append({ + "id": tc_id, + "type": "function", + "function": {"name": fn_name, "arguments": fn_args}, + }) + except Exception: + continue + messages.append(assistant_record) + + for tc in tool_calls: + try: + tc_id = getattr(tc, "id", None) or "tc-?" + fn = getattr(tc, "function", None) + fn_name = getattr(fn, "name", "") if fn is not None else "" + fn_args_raw = getattr(fn, "arguments", "") if fn is not None else "" + except Exception: + continue + try: + args = json.loads(fn_args_raw) if isinstance(fn_args_raw, str) else (fn_args_raw or {}) + except Exception: + args = {} + if fn_name == "read_file": + tool_result = _judge_read_file( + str(args.get("path", "")), + offset=args.get("offset", 1), + limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES), + allowed_path=history_path, + ) + else: + tool_result = json.dumps({"error": f"unknown tool: {fn_name}"}) + messages.append({ + "role": "tool", + "tool_call_id": tc_id, + "name": fn_name, + "content": tool_result, + }) + tool_calls_left -= 1 + + if tool_calls_left <= 0: + messages.append({ + "role": "user", + "content": ( + "You have exhausted your read_file budget. Issue your " + "final JSON verdict now without calling more tools." + ), + }) + tools = None + + parsed, parse_failed = _parse_evaluate_response(final_raw) + logger.info( + "goal judge (checklist): updates=%d new_items=%d reason=%s", + len(parsed.get("updates") or []), + len(parsed.get("new_items") or []), + _truncate(parsed.get("reason", ""), 120), + ) + return parsed, parse_failed + + # ────────────────────────────────────────────────────────────────────── # GoalManager — the orchestration surface CLI + gateway talk to # ────────────────────────────────────────────────────────────────────── @@ -368,8 +1058,12 @@ class GoalManager: - ``clear()`` — remove the active goal. - ``pause()`` / ``resume()`` — explicit user controls. - ``status()`` — printable one-liner. - - ``evaluate_after_turn(last_response)`` — call the judge, update state, - and return a decision dict the caller uses to drive the next turn. + - ``add_subgoal(text)`` — user appends a checklist item. + - ``mark_subgoal(index, status)`` — user flips an item (override). + - ``remove_subgoal(index)`` — user deletes an item. + - ``clear_checklist()`` — user wipes the checklist; next turn re-decomposes. + - ``evaluate_after_turn(last_response, agent=None)`` — call the judge, + update state, return a decision dict. - ``next_continuation_prompt()`` — the canonical user-role message to feed back into ``run_conversation``. """ @@ -396,14 +1090,26 @@ class GoalManager: if s is None or s.status in ("cleared",): return "No active goal. Set one with /goal ." turns = f"{s.turns_used}/{s.max_turns} turns" + cl_total, cl_done, cl_imp, _ = s.checklist_counts() + cl_text = "" + if cl_total: + cl_text = f", {cl_done + cl_imp}/{cl_total} done" if s.status == "active": - return f"⊙ Goal (active, {turns}): {s.goal}" + return f"⊙ Goal (active, {turns}{cl_text}): {s.goal}" if s.status == "paused": extra = f" — {s.paused_reason}" if s.paused_reason else "" - return f"⏸ Goal (paused, {turns}{extra}): {s.goal}" + return f"⏸ Goal (paused, {turns}{cl_text}{extra}): {s.goal}" if s.status == "done": - return f"✓ Goal done ({turns}): {s.goal}" - return f"Goal ({s.status}, {turns}): {s.goal}" + return f"✓ Goal done ({turns}{cl_text}): {s.goal}" + return f"Goal ({s.status}, {turns}{cl_text}): {s.goal}" + + def render_checklist(self) -> str: + """Public helper for the /subgoal slash command.""" + if self._state is None: + return "(no active goal)" + if not self._state.checklist: + return "(checklist empty — judge will populate it on the next turn)" + return self._state.render_checklist(numbered=True) # --- mutation ----------------------------------------------------- @@ -418,6 +1124,8 @@ class GoalManager: max_turns=int(max_turns) if max_turns else self.default_max_turns, created_at=time.time(), last_turn_at=0.0, + checklist=[], + decomposed=False, ) self._state = state save_goal(self.session_id, state) @@ -456,6 +1164,77 @@ class GoalManager: self._state.last_reason = reason save_goal(self.session_id, self._state) + # --- /subgoal user controls --------------------------------------- + + def add_subgoal(self, text: str) -> ChecklistItem: + """User appends a new checklist item. Requires an active or paused goal.""" + if self._state is None: + raise RuntimeError("no active goal") + text = (text or "").strip() + if not text: + raise ValueError("subgoal text is empty") + item = ChecklistItem( + text=text, + status=ITEM_PENDING, + added_by=ADDED_BY_USER, + added_at=time.time(), + ) + self._state.checklist.append(item) + save_goal(self.session_id, self._state) + return item + + def mark_subgoal(self, index_1based: int, status: str) -> ChecklistItem: + """User overrides an item's status. + + ``status`` may be ``completed``, ``impossible``, or ``pending`` + (the last only as an undo flow). Stickiness rules do NOT apply to + user actions — the user is the only authority that can revert + terminal items. + """ + if self._state is None: + raise RuntimeError("no active goal") + status = (status or "").strip().lower() + if status not in VALID_ITEM_STATUSES: + raise ValueError( + f"status must be one of {sorted(VALID_ITEM_STATUSES)}; got {status!r}" + ) + idx = int(index_1based) - 1 + if idx < 0 or idx >= len(self._state.checklist): + raise IndexError( + f"index out of range (1..{len(self._state.checklist)})" + ) + item = self._state.checklist[idx] + item.status = status + if status in TERMINAL_ITEM_STATUSES: + item.completed_at = time.time() + if not item.evidence: + item.evidence = "marked by user" + else: + item.completed_at = None + # Don't wipe judge-supplied evidence on undo — useful audit trail. + save_goal(self.session_id, self._state) + return item + + def remove_subgoal(self, index_1based: int) -> ChecklistItem: + if self._state is None: + raise RuntimeError("no active goal") + idx = int(index_1based) - 1 + if idx < 0 or idx >= len(self._state.checklist): + raise IndexError( + f"index out of range (1..{len(self._state.checklist)})" + ) + removed = self._state.checklist.pop(idx) + save_goal(self.session_id, self._state) + return removed + + def clear_checklist(self) -> None: + """Wipe the checklist and reset decomposed=False so the judge re-decomposes.""" + if self._state is None: + return + self._state.checklist = [] + self._state.decomposed = False + save_goal(self.session_id, self._state) + # --- the main entry point called after every turn ----------------- def evaluate_after_turn( @@ -463,6 +1242,8 @@ class GoalManager: last_response: str, *, user_initiated: bool = True, + agent: Any = None, + messages: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: """Run the judge and update state. Return a decision dict. @@ -470,11 +1251,21 @@ class GoalManager: continuation prompt we fed ourselves (False). Both increment ``turns_used`` because both consume model budget. + ``messages`` is the agent's full conversation list for this session. + When provided, it's dumped to ``/goals/.json`` so + the Phase-B judge's read_file tool can inspect history. Optional — + when missing, the judge runs from the snippet only. + + ``agent`` is a back-compat path — when ``messages`` is None we try + to extract them from common AIAgent attribute names. Most callers + should pass ``messages`` directly because AIAgent does not store + the message list as a public instance attribute. + Decision keys: - ``status``: current goal status after update - ``should_continue``: bool — caller should fire another turn - ``continuation_prompt``: str or None - - ``verdict``: "done" | "continue" | "skipped" | "inactive" + - ``verdict``: "done" | "continue" | "skipped" | "inactive" | "decompose" - ``reason``: str - ``message``: user-visible one-liner to print/send """ @@ -493,7 +1284,45 @@ class GoalManager: state.turns_used += 1 state.last_turn_at = time.time() - verdict, reason, parse_failed = judge_goal(state.goal, last_response) + # ── Phase A: decompose (first call after /goal set) ─────────── + if not state.decomposed: + items, err = decompose_goal(state.goal) + state.decomposed = True + decompose_message = "" + if items: + now = time.time() + for entry in items: + state.checklist.append( + ChecklistItem( + text=entry["text"], + status=ITEM_PENDING, + added_by=ADDED_BY_JUDGE, + added_at=now, + ) + ) + state.last_verdict = "decompose" + state.last_reason = f"decomposed into {len(items)} items" + decompose_message = ( + f"⊙ Goal checklist created ({len(items)} items). " + f"Use /subgoal to view or edit it." + ) + save_goal(self.session_id, state) + return { + "status": "active", + "should_continue": True, + "continuation_prompt": self.next_continuation_prompt(), + "verdict": "decompose", + "reason": state.last_reason, + "message": decompose_message, + } + # Decompose failed — fall through to freeform mode below. + logger.info("goal: decompose failed (%s) — falling back to freeform judge", err) + state.last_reason = f"decompose failed: {err}" + + # ── Phase B: evaluate ──────────────────────────────────────── + verdict, reason, parse_failed = self._evaluate_state_phase_b( + state, last_response, agent=agent, messages=messages + ) state.last_verdict = verdict state.last_reason = reason @@ -518,11 +1347,7 @@ class GoalManager: } # Auto-pause when the judge model can't produce the expected JSON - # verdict N turns in a row. Points the user at the goal_judge config - # so they can route this side task to a model that follows the - # contract (e.g. google/gemini-3-flash-preview). Without this guard, - # weak judge models burn the entire turn budget returning prose or - # empty strings. + # verdict N turns in a row. if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES: state.status = "paused" state.paused_reason = ( @@ -564,6 +1389,10 @@ class GoalManager: } save_goal(self.session_id, state) + cl_total, cl_done, cl_imp, _ = state.checklist_counts() + progress = "" + if cl_total: + progress = f" — {cl_done + cl_imp}/{cl_total} done" return { "status": "active", "should_continue": True, @@ -571,23 +1400,168 @@ class GoalManager: "verdict": "continue", "reason": reason, "message": ( - f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}" + f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}{progress}): {reason}" ), } + def _evaluate_state_phase_b( + self, + state: GoalState, + last_response: str, + *, + agent: Any = None, + messages: Optional[List[Dict[str, Any]]] = None, + ) -> Tuple[str, str, bool]: + """Run the right kind of Phase-B evaluation given current state. + + With a non-empty checklist: harsh per-item evaluation with a bounded + read_file tool loop. + + With an empty checklist (e.g. decompose failed twice): fall back to + the legacy freeform judge so the goal still has a way to terminate. + """ + if not last_response.strip(): + return "continue", "empty response (nothing to evaluate)", False + + if state.checklist: + # Dump conversation history if we have one. Prefer explicit + # ``messages`` arg (most reliable); fall back to extracting from + # the agent instance for back-compat. + history_path: Optional[Path] = None + msgs: List[Dict[str, Any]] = [] + if messages: + msgs = list(messages) + elif agent is not None: + msgs = self._extract_agent_messages(agent) + if msgs: + history_path = dump_conversation(self.session_id, msgs) + if history_path is None: + logger.debug( + "goal: conversation dump failed for session %s", + self.session_id, + ) + else: + logger.debug( + "goal: no messages available for session %s — judge will run from snippet only", + self.session_id, + ) + + parsed, parse_failed = evaluate_checklist( + state, last_response, history_path=history_path + ) + self._apply_checklist_updates(state, parsed) + + if state.all_terminal(): + return "done", parsed.get("reason") or "all checklist items terminal", parse_failed + return "continue", parsed.get("reason") or "checklist progress", parse_failed + + # No checklist — freeform fallback. + verdict, reason, parse_failed = judge_goal_freeform(state.goal, last_response) + return verdict, reason, parse_failed + + # --- internal helpers --------------------------------------------- + + @staticmethod + def _extract_agent_messages(agent: Any) -> List[Dict[str, Any]]: + """Best-effort extraction of the agent's conversation history. + + Tries common attribute names so we don't tightly couple to AIAgent. + Returns an empty list when nothing is available. + """ + for attr in ("messages", "conversation_history", "_messages", "history"): + try: + msgs = getattr(agent, attr, None) + if isinstance(msgs, list) and msgs: + return msgs + except Exception: + continue + return [] + + @staticmethod + def _apply_checklist_updates(state: GoalState, parsed: Dict[str, Any]) -> None: + """Apply judge updates with stickiness: never regress terminal items.""" + now = time.time() + for upd in parsed.get("updates") or []: + try: + idx = int(upd["index"]) + except (KeyError, TypeError, ValueError): + continue + if idx < 0 or idx >= len(state.checklist): + continue + item = state.checklist[idx] + if item.status in TERMINAL_ITEM_STATUSES: + # Stickiness: judge cannot regress a terminal item. + continue + new_status = upd.get("status") + if new_status not in TERMINAL_ITEM_STATUSES: + continue + item.status = new_status + item.completed_at = now + evidence = upd.get("evidence") + if evidence: + item.evidence = evidence + + for new_item in parsed.get("new_items") or []: + text = (new_item.get("text") or "").strip() + if not text: + continue + state.checklist.append( + ChecklistItem( + text=text, + status=ITEM_PENDING, + added_by=ADDED_BY_JUDGE, + added_at=now, + ) + ) + + # --- continuation prompt ------------------------------------------ + def next_continuation_prompt(self) -> Optional[str]: if not self._state or self._state.status != "active": return None - return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) + if not self._state.checklist: + return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) + cl_total, cl_done, cl_imp, _ = self._state.checklist_counts() + return CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE.format( + goal=self._state.goal, + done=cl_done + cl_imp, + total=cl_total, + checklist=self._state.render_checklist(numbered=False), + ) + + +# Public name kept for back-compat with the previous freeform-only API. +def judge_goal( + goal: str, + last_response: str, + *, + timeout: float = DEFAULT_JUDGE_TIMEOUT, +) -> Tuple[str, str, bool]: + """Back-compat wrapper — defers to the freeform judge.""" + return judge_goal_freeform(goal, last_response, timeout=timeout) __all__ = [ + "ChecklistItem", "GoalState", "GoalManager", "CONTINUATION_PROMPT_TEMPLATE", + "CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE", "DEFAULT_MAX_TURNS", + "DEFAULT_MAX_JUDGE_TOOL_CALLS", + "ITEM_PENDING", + "ITEM_COMPLETED", + "ITEM_IMPOSSIBLE", + "ITEM_MARKERS", + "TERMINAL_ITEM_STATUSES", + "VALID_ITEM_STATUSES", "load_goal", "save_goal", "clear_goal", "judge_goal", + "judge_goal_freeform", + "decompose_goal", + "evaluate_checklist", + "conversation_dump_path", + "dump_conversation", ] diff --git a/tests/cli/test_cli_goal_interrupt.py b/tests/cli/test_cli_goal_interrupt.py index 851b87e856b..879e87c6a73 100644 --- a/tests/cli/test_cli_goal_interrupt.py +++ b/tests/cli/test_cli_goal_interrupt.py @@ -58,6 +58,11 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"): mgr = GoalManager(session_id=session_id, default_max_turns=5) mgr.set(goal_text) + # Skip Phase-A decompose so tests can patch judge_goal_freeform directly + # for legacy verdict assertions. + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) cli._goal_manager = mgr return cli, mgr @@ -81,7 +86,7 @@ class TestInterruptAutoPause: # Judge MUST NOT run on an interrupted turn. If it does, we've # regressed — fail loudly instead of silently querying a mock. - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an interrupted turn" ) @@ -106,7 +111,7 @@ class TestInterruptAutoPause: cli.conversation_history = [ {"role": "assistant", "content": "partial"}, ] - with patch("hermes_cli.goals.judge_goal"): + with patch("hermes_cli.goals.judge_goal_freeform"): cli._maybe_continue_goal_after_turn() assert mgr.state.status == "paused" @@ -125,7 +130,7 @@ class TestEmptyResponseSkip: {"role": "assistant", "content": " \n\n "}, ] - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an empty response" ) @@ -144,7 +149,7 @@ class TestEmptyResponseSkip: {"role": "user", "content": "go"}, ] - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called without an assistant response" ) @@ -169,7 +174,7 @@ class TestHealthyTurnStillRuns: # Force the judge to say "continue" without touching the network. with patch( - "hermes_cli.goals.judge_goal", + "hermes_cli.goals.judge_goal_freeform", return_value=("continue", "needs more steps", False), ): cli._maybe_continue_goal_after_turn() @@ -189,7 +194,7 @@ class TestHealthyTurnStillRuns: ] with patch( - "hermes_cli.goals.judge_goal", + "hermes_cli.goals.judge_goal_freeform", return_value=("done", "goal satisfied", False), ): cli._maybe_continue_goal_after_turn() diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py index 14f536aa4f8..ce9e8d6ad1e 100644 --- a/tests/gateway/test_goal_verdict_send.py +++ b/tests/gateway/test_goal_verdict_send.py @@ -106,8 +106,11 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("ship the feature") + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -135,8 +138,11 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("polish the docs") + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -164,7 +170,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home): state.turns_used = 2 save_goal(session_entry.session_id, state) - with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -211,7 +217,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home): runner.adapters[Platform.TELEGRAM] = _NoSendAdapter() - with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)): # must not raise await runner._post_turn_goal_continuation( session_entry=session_entry, diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py index b5afd716c9e..47a043a32cf 100644 --- a/tests/hermes_cli/test_goals.py +++ b/tests/hermes_cli/test_goals.py @@ -253,14 +253,20 @@ class TestGoalManager: assert mgr2.is_active() def test_evaluate_after_turn_done(self, hermes_home): - """Judge says done → status=done, no continuation.""" + """Judge says done → status=done, no continuation. + + Skips Phase-A decompose by patching ``decompose_goal`` to return + an empty checklist so the manager falls through to the freeform + judge path (legacy behavior preserved when decompose is unavailable). + """ from hermes_cli import goals from hermes_cli.goals import GoalManager mgr = GoalManager(session_id="eval-sid-1") mgr.set("ship it") - with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)): decision = mgr.evaluate_after_turn("I shipped the feature.") assert decision["verdict"] == "done" @@ -276,7 +282,8 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5) mgr.set("a long goal") - with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)): decision = mgr.evaluate_after_turn("made some progress") assert decision["verdict"] == "continue" @@ -294,7 +301,8 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2) mgr.set("hard goal") - with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.turns_used == 1 @@ -434,9 +442,11 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20) mgr.set("do a thing") - with patch.object( - goals, "judge_goal", return_value=("continue", "judge returned empty response", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "judge returned empty response", True), + ): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.consecutive_parse_failures == 1 @@ -463,17 +473,21 @@ class TestJudgeParseFailureAutoPause: mgr.set("another goal") # Two parse failures… - with patch.object( - goals, "judge_goal", return_value=("continue", "not json", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "not json", True), + ): mgr.evaluate_after_turn("step 1") mgr.evaluate_after_turn("step 2") assert mgr.state.consecutive_parse_failures == 2 # …then one clean reply resets the counter. - with patch.object( - goals, "judge_goal", return_value=("continue", "making progress", False) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "making progress", False), + ): d = mgr.evaluate_after_turn("step 3") assert d["should_continue"] is True assert mgr.state.consecutive_parse_failures == 0 @@ -486,9 +500,11 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20) mgr.set("goal") - with patch.object( - goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "judge error: RuntimeError", False), + ): for _ in range(5): d = mgr.evaluate_after_turn("still going") assert d["should_continue"] is True @@ -505,12 +521,553 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20) mgr.set("persistent goal") - with patch.object( - goals, "judge_goal", return_value=("continue", "empty", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "empty", True), + ): mgr.evaluate_after_turn("r") mgr.evaluate_after_turn("r") reloaded = load_goal("parse-fail-sid-4") assert reloaded is not None assert reloaded.consecutive_parse_failures == 2 + + +# ────────────────────────────────────────────────────────────────────── +# Checklist mode: GoalState backcompat + ChecklistItem +# ────────────────────────────────────────────────────────────────────── + + +class TestGoalStateBackcompat: + def test_old_state_meta_row_loads_without_checklist_fields(self): + """A goal serialized BEFORE the checklist fields existed must + round-trip through GoalState.from_json with empty defaults.""" + from hermes_cli.goals import GoalState + + legacy_json = json.dumps({ + "goal": "do the thing", + "status": "active", + "turns_used": 3, + "max_turns": 20, + "created_at": 1.0, + "last_turn_at": 2.0, + "last_verdict": "continue", + "last_reason": "still working", + "paused_reason": None, + "consecutive_parse_failures": 1, + }) + state = GoalState.from_json(legacy_json) + assert state.goal == "do the thing" + assert state.checklist == [] + assert state.decomposed is False + + def test_new_state_round_trip(self): + from hermes_cli.goals import ( + ChecklistItem, + GoalState, + ITEM_COMPLETED, + ITEM_PENDING, + ADDED_BY_JUDGE, + ADDED_BY_USER, + ) + + state = GoalState( + goal="g", + decomposed=True, + checklist=[ + ChecklistItem(text="a", status=ITEM_COMPLETED, + added_by=ADDED_BY_JUDGE, evidence="done"), + ChecklistItem(text="b", status=ITEM_PENDING, + added_by=ADDED_BY_USER), + ], + ) + round_tripped = GoalState.from_json(state.to_json()) + assert round_tripped.decomposed is True + assert len(round_tripped.checklist) == 2 + assert round_tripped.checklist[0].text == "a" + assert round_tripped.checklist[0].status == ITEM_COMPLETED + assert round_tripped.checklist[0].evidence == "done" + assert round_tripped.checklist[1].added_by == ADDED_BY_USER + + def test_checklist_counts_and_all_terminal(self): + from hermes_cli.goals import ( + ChecklistItem, GoalState, + ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING, + ) + + state = GoalState( + goal="g", + checklist=[ + ChecklistItem(text="a", status=ITEM_COMPLETED), + ChecklistItem(text="b", status=ITEM_IMPOSSIBLE), + ChecklistItem(text="c", status=ITEM_PENDING), + ], + ) + total, done, imp, pending = state.checklist_counts() + assert (total, done, imp, pending) == (3, 1, 1, 1) + assert state.all_terminal() is False + + state.checklist[2].status = ITEM_IMPOSSIBLE + assert state.all_terminal() is True + + def test_empty_checklist_is_not_all_terminal(self): + """Empty list must NOT be considered done.""" + from hermes_cli.goals import GoalState + + state = GoalState(goal="g") + assert state.all_terminal() is False + + +# ────────────────────────────────────────────────────────────────────── +# Phase A: decompose +# ────────────────────────────────────────────────────────────────────── + + +class TestPhaseADecompose: + def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home): + from hermes_cli import goals + from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE + + mgr = GoalManager(session_id="phase-a-sid-1") + mgr.set("build a website") + + items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}] + with patch.object(goals, "decompose_goal", return_value=(items, None)): + d = mgr.evaluate_after_turn("(initial response)") + + assert d["verdict"] == "decompose" + assert d["should_continue"] is True + # Phase A produces a continuation prompt that includes the checklist. + assert d["continuation_prompt"] is not None + assert "Checklist progress" in d["continuation_prompt"] + assert mgr.state.decomposed is True + assert len(mgr.state.checklist) == 2 + assert mgr.state.checklist[0].text == "homepage exists" + assert mgr.state.checklist[0].status == ITEM_PENDING + assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE + + def test_decompose_only_runs_once(self, hermes_home): + """Decomposed=True after first call. Subsequent calls go to Phase B.""" + from hermes_cli import goals + from hermes_cli.goals import GoalManager + + mgr = GoalManager(session_id="phase-a-sid-2") + mgr.set("g") + + with patch.object( + goals, "decompose_goal", return_value=([{"text": "x"}], None) + ) as decompose_mock, patch.object( + goals, "evaluate_checklist", + return_value=({"updates": [], "new_items": [], "reason": "..."}, False), + ) as eval_mock: + mgr.evaluate_after_turn("turn 1") + mgr.evaluate_after_turn("turn 2") + mgr.evaluate_after_turn("turn 3") + + assert decompose_mock.call_count == 1 + assert eval_mock.call_count == 2 + + def test_decompose_failure_falls_back_to_freeform(self, hermes_home): + """If decompose returns no items, manager falls through to freeform judge.""" + from hermes_cli import goals + from hermes_cli.goals import GoalManager + + mgr = GoalManager(session_id="phase-a-sid-3") + mgr.set("g") + + with patch.object(goals, "decompose_goal", return_value=([], "model error")), \ + patch.object(goals, "judge_goal_freeform", + return_value=("done", "shipped", False)): + d = mgr.evaluate_after_turn("done!") + + assert d["verdict"] == "done" + assert mgr.state.decomposed is True + assert mgr.state.checklist == [] + + +# ────────────────────────────────────────────────────────────────────── +# Phase B: evaluate (checklist mode) +# ────────────────────────────────────────────────────────────────────── + + +class TestPhaseBChecklist: + def _make_decomposed_mgr(self, sid: str, items): + """Helper: skip Phase A, install a decomposed checklist directly.""" + from hermes_cli.goals import ( + GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE, + ) + from hermes_cli import goals as _g + mgr = GoalManager(session_id=sid) + mgr.set("a goal") + mgr.state.decomposed = True + mgr.state.checklist = [ + ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE) + for t in items + ] + _g.save_goal(sid, mgr.state) + return mgr + + def test_judge_flips_pending_to_completed(self, hermes_home): + from hermes_cli import goals + from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING + + mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [ + {"index": 0, "status": "completed", "evidence": "done"}, + {"index": 1, "status": "completed", "evidence": "shipped"}, + ], + "new_items": [], + "reason": "made progress", + }, + False, + ), + ): + d = mgr.evaluate_after_turn("agent did stuff") + + assert d["verdict"] == "continue" + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "done" + assert mgr.state.checklist[1].status == ITEM_COMPLETED + assert mgr.state.checklist[2].status == ITEM_PENDING + + def test_goal_done_when_all_items_terminal(self, hermes_home): + from hermes_cli import goals + + mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [ + {"index": 0, "status": "completed", "evidence": "ok"}, + {"index": 1, "status": "impossible", "evidence": "blocked"}, + ], + "new_items": [], + "reason": "all done or blocked", + }, + False, + ), + ): + d = mgr.evaluate_after_turn("response") + + assert d["verdict"] == "done" + assert d["should_continue"] is False + assert mgr.state.status == "done" + + def test_stickiness_judge_cannot_regress_completed(self, hermes_home): + """Once an item is completed, judge updates trying to flip it back are ignored.""" + from hermes_cli import goals + from hermes_cli.goals import ITEM_COMPLETED + + mgr = self._make_decomposed_mgr("phase-b-stick", ["a"]) + # First turn completes item 0. + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [{"index": 0, "status": "completed", "evidence": "yes"}], + "new_items": [], + "reason": "done", + }, + False, + ), + ): + mgr.evaluate_after_turn("turn 1") + assert mgr.state.checklist[0].status == ITEM_COMPLETED + # Second turn: judge tries to send a non-terminal update. + # _parse_evaluate_response already filters non-terminal, but at the + # apply layer we also skip terminal items entirely. Smoke both. + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [{"index": 0, "status": "impossible", "evidence": "regress"}], + "new_items": [], + "reason": "trying to regress", + }, + False, + ), + ): + mgr.evaluate_after_turn("turn 2") + # Sticky: status stays completed, evidence unchanged. + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "yes" + + def test_judge_appends_new_items(self, hermes_home): + from hermes_cli import goals + + mgr = self._make_decomposed_mgr("phase-b-new", ["a"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [], + "new_items": [{"text": "newly discovered"}, {"text": "also this"}], + "reason": "found more work", + }, + False, + ), + ): + mgr.evaluate_after_turn("response") + assert len(mgr.state.checklist) == 3 + assert mgr.state.checklist[1].text == "newly discovered" + assert mgr.state.checklist[1].added_by == "judge" + + +# ────────────────────────────────────────────────────────────────────── +# /subgoal user controls +# ────────────────────────────────────────────────────────────────────── + + +class TestSubgoalUserControls: + def test_add_subgoal_appends_user_item(self, hermes_home): + from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER + + mgr = GoalManager(session_id="user-sid-1") + mgr.set("g") + item = mgr.add_subgoal("user added") + assert item.text == "user added" + assert item.status == ITEM_PENDING + assert item.added_by == ADDED_BY_USER + assert len(mgr.state.checklist) == 1 + + def test_add_subgoal_requires_active_goal(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-2") + with pytest.raises(RuntimeError): + mgr.add_subgoal("x") + + def test_add_subgoal_rejects_empty_text(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-3") + mgr.set("g") + with pytest.raises(ValueError): + mgr.add_subgoal(" ") + + def test_mark_subgoal_uses_1_based_index(self, hermes_home): + from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE + mgr = GoalManager(session_id="user-sid-4") + mgr.set("g") + mgr.add_subgoal("a") + mgr.add_subgoal("b") + mgr.add_subgoal("c") + mgr.mark_subgoal(2, "completed") + mgr.mark_subgoal(3, "impossible") + assert mgr.state.checklist[0].status == "pending" + assert mgr.state.checklist[1].status == ITEM_COMPLETED + assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE + + def test_mark_subgoal_rejects_invalid_index(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-5") + mgr.set("g") + mgr.add_subgoal("a") + with pytest.raises(IndexError): + mgr.mark_subgoal(5, "completed") + with pytest.raises(IndexError): + mgr.mark_subgoal(0, "completed") + + def test_user_can_revert_terminal_item(self, hermes_home): + """User mark_subgoal bypasses stickiness — only path to revert.""" + from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING + mgr = GoalManager(session_id="user-sid-6") + mgr.set("g") + mgr.add_subgoal("a") + mgr.mark_subgoal(1, "completed") + assert mgr.state.checklist[0].status == ITEM_COMPLETED + mgr.mark_subgoal(1, "pending") + assert mgr.state.checklist[0].status == ITEM_PENDING + + def test_remove_subgoal(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-7") + mgr.set("g") + mgr.add_subgoal("a") + mgr.add_subgoal("b") + mgr.add_subgoal("c") + removed = mgr.remove_subgoal(2) + assert removed.text == "b" + assert [it.text for it in mgr.state.checklist] == ["a", "c"] + + def test_clear_checklist_resets_decomposed(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-8") + mgr.set("g") + mgr.state.decomposed = True + mgr.add_subgoal("a") + mgr.clear_checklist() + assert mgr.state.checklist == [] + assert mgr.state.decomposed is False + + +# ────────────────────────────────────────────────────────────────────── +# Conversation dump +# ────────────────────────────────────────────────────────────────────── + + +class TestConversationDump: + def test_dump_writes_messages_to_goals_dir(self, hermes_home): + from hermes_cli.goals import dump_conversation, conversation_dump_path + + msgs = [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}, + ] + path = dump_conversation("dump-sid-1", msgs) + assert path is not None + assert path.exists() + # Path is under /goals/.json + assert path.parent.name == "goals" + assert path.name == "dump-sid-1.json" + + loaded = json.loads(path.read_text()) + assert loaded == msgs + + # conversation_dump_path returns the same path + assert conversation_dump_path("dump-sid-1") == path + + def test_dump_handles_unsafe_session_id(self, hermes_home): + from hermes_cli.goals import dump_conversation + + path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}]) + assert path is not None + # No traversal — slashes are normalized to underscores. (Periods are + # preserved because they're legitimate in filenames; the resulting + # name still cannot escape /goals/ since path + # separators are gone.) + assert "/" not in path.name + assert path.parent.name == "goals" + # Verify the resolved path stays under the goals dir. + from hermes_cli.goals import _goals_dump_dir + goals_dir = _goals_dump_dir().resolve() + assert str(path.resolve()).startswith(str(goals_dir)) + + def test_dump_skips_when_messages_empty(self, hermes_home): + from hermes_cli.goals import dump_conversation + assert dump_conversation("sid", []) is None + assert dump_conversation("", [{"role": "user", "content": "x"}]) is None + + +# ────────────────────────────────────────────────────────────────────── +# Judge read_file tool: path restriction +# ────────────────────────────────────────────────────────────────────── + + +class TestJudgeReadFile: + def test_restricted_to_allowed_path(self, hermes_home, tmp_path): + from hermes_cli.goals import _judge_read_file + + allowed = tmp_path / "allowed.json" + allowed.write_text("hello\nworld\n") + + ok = _judge_read_file(str(allowed), allowed_path=allowed) + loaded = json.loads(ok) + assert loaded["content"].startswith("hello") + + # Try to read a different file. + sneaky = tmp_path / "secret.txt" + sneaky.write_text("nope\n") + denied = _judge_read_file(str(sneaky), allowed_path=allowed) + loaded = json.loads(denied) + assert "error" in loaded + assert "restricted" in loaded["error"] + + def test_pagination(self, hermes_home, tmp_path): + from hermes_cli.goals import _judge_read_file + f = tmp_path / "big.json" + f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n") + + # offset=10, limit=5 should return lines 10..14. + result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f)) + assert result["returned"] == 5 + assert "line-9" in result["content"] # 1-based: line 10 == zero-indexed 9 + assert result["next_offset"] == 15 + + +# ────────────────────────────────────────────────────────────────────── +# Index conversion: judge emits 1-based, apply layer uses 0-based +# ────────────────────────────────────────────────────────────────────── + + +class TestJudgeIndexConversion: + def test_parse_evaluate_converts_1based_to_0based(self): + """The judge sees the checklist with 1-based indices (rendered as + '1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices. + ``_parse_evaluate_response`` must convert them to 0-based so the + apply layer can index ``state.checklist`` directly. + """ + from hermes_cli.goals import _parse_evaluate_response + + raw = ''' + {"updates": [ + {"index": 1, "status": "completed", "evidence": "first item"}, + {"index": 3, "status": "impossible", "evidence": "third item"} + ], + "new_items": [], + "reason": "evaluated"} + ''' + parsed, parse_failed = _parse_evaluate_response(raw) + assert parse_failed is False + # 1 → 0, 3 → 2 + assert [u["index"] for u in parsed["updates"]] == [0, 2] + assert parsed["updates"][0]["evidence"] == "first item" + assert parsed["updates"][1]["status"] == "impossible" + + def test_full_round_trip_judge_index_to_state(self, hermes_home): + """End-to-end: judge emits 1-based, parser converts, apply layer + flips the right items in state.checklist.""" + from hermes_cli import goals + from hermes_cli.goals import ( + GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED, + ADDED_BY_JUDGE, + ) + + mgr = GoalManager(session_id="idx-round-trip") + mgr.set("g") + mgr.state.decomposed = True + mgr.state.checklist = [ + ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ] + goals.save_goal("idx-round-trip", mgr.state) + + # Simulate the judge returning a raw-JSON Phase-B reply via the + # auxiliary client: the parser handles the 1-based → 0-based + # conversion so the apply layer flips item 1 (text="first"). + class FakeMessage: + content = ''' + {"updates": [{"index": 1, "status": "completed", "evidence": "first done"}], + "new_items": [], + "reason": "..."} + ''' + tool_calls = None + + class FakeChoice: + message = FakeMessage() + + class FakeResponse: + choices = [FakeChoice()] + + class FakeClient: + class chat: + class completions: + @staticmethod + def create(**kwargs): + return FakeResponse() + + with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")): + mgr.evaluate_after_turn("ran the script and item 1 is done") + + # Item 1 (text="first") should now be completed. + assert mgr.state.checklist[0].text == "first" + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "first done" + # Other items still pending. + assert mgr.state.checklist[1].status == ITEM_PENDING + assert mgr.state.checklist[2].status == ITEM_PENDING diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 73b2fc1a811..07febffaf98 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -3251,6 +3251,7 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None: decision = goal_mgr.evaluate_after_turn( raw, user_initiated=True, + messages=list(session.get("history") or []), ) verdict_msg = decision.get("message") or "" if verdict_msg: