diff --git a/cli.py b/cli.py index b3b6a73ffe2..e54acc1c76c 100644 --- a/cli.py +++ b/cli.py @@ -7244,6 +7244,8 @@ class HermesCLI: _cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}") elif canonical == "goal": self._handle_goal_command(cmd_original) + elif canonical == "subgoal": + self._handle_subgoal_command(cmd_original) elif canonical == "skin": self._handle_skin_command(cmd_original) elif canonical == "voice": @@ -7840,6 +7842,103 @@ class HermesCLI: except Exception: pass + def _handle_subgoal_command(self, cmd: str) -> None: + """Dispatch /subgoal subcommands. + + Forms: + /subgoal show the checklist + /subgoal append a user item + /subgoal complete mark item n completed + /subgoal impossible mark item n impossible + /subgoal undo revert item n to pending + /subgoal remove delete item n + /subgoal clear wipe the checklist (judge re-decomposes) + """ + parts = (cmd or "").strip().split(None, 2) + # parts[0] == "/subgoal"; remainder is what the user typed + arg = " ".join(parts[1:]).strip() if len(parts) > 1 else "" + + mgr = self._get_goal_manager() + if mgr is None: + _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}") + return + + if not mgr.has_goal(): + _cprint(f" {_DIM}No active goal. Set one with /goal .{_RST}") + return + + # No args → show the checklist. + if not arg: + _cprint(f" {mgr.status_line()}") + _cprint(f" {mgr.render_checklist()}") + return + + tokens = arg.split(None, 1) + verb = tokens[0].lower() + rest = tokens[1].strip() if len(tokens) > 1 else "" + + # Action verbs operate on indices. + action_status_map = { + "complete": "completed", + "completed": "completed", + "done": "completed", + "impossible": "impossible", + "imp": "impossible", + "skip": "impossible", + "undo": "pending", + "pending": "pending", + "reset": "pending", + } + if verb in action_status_map: + if not rest: + _cprint(f" Usage: /subgoal {verb} ") + return + try: + idx = int(rest.split()[0]) + except ValueError: + _cprint(f" /subgoal {verb}: must be an integer (1-based index).") + return + try: + item = mgr.mark_subgoal(idx, action_status_map[verb]) + except (IndexError, ValueError, RuntimeError) as exc: + _cprint(f" /subgoal {verb}: {exc}") + return + _cprint(f" ✓ Item {idx} → {item.status}: {item.text}") + return + + if verb == "remove": + if not rest: + _cprint(" Usage: /subgoal remove ") + return + try: + idx = int(rest.split()[0]) + except ValueError: + _cprint(" /subgoal remove: must be an integer (1-based index).") + return + try: + removed = mgr.remove_subgoal(idx) + except (IndexError, RuntimeError) as exc: + _cprint(f" /subgoal remove: {exc}") + return + _cprint(f" ✓ Removed item {idx}: {removed.text}") + return + + if verb == "clear": + mgr.clear_checklist() + _cprint( + " ✓ Checklist cleared. The judge will re-decompose on the next turn." + ) + return + + # Otherwise: append `arg` as a user-authored checklist item. + try: + item = mgr.add_subgoal(arg) + except (ValueError, RuntimeError) as exc: + _cprint(f" /subgoal: {exc}") + return + idx = len(mgr.state.checklist) if mgr.state else 0 + _cprint(f" ✓ Added subgoal {idx}: {item.text}") + def _maybe_continue_goal_after_turn(self) -> None: """Hook run after every CLI turn. Judges + maybe re-queues. @@ -7917,7 +8016,11 @@ class HermesCLI: if not last_response.strip(): return - decision = mgr.evaluate_after_turn(last_response, user_initiated=True) + decision = mgr.evaluate_after_turn( + last_response, + user_initiated=True, + messages=getattr(self, "conversation_history", None) or [], + ) msg = decision.get("message") or "" if msg: _cprint(f" {msg}") diff --git a/gateway/run.py b/gateway/run.py index 129827cbfb6..358e3702489 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -6063,6 +6063,12 @@ class GatewayRunner: return await self._handle_goal_command(event) return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal." + # /subgoal is safe mid-run — it only modifies the active goal's + # checklist, which the judge consults at turn boundaries. There + # is no race with the running turn. + if _cmd_def_inner and _cmd_def_inner.name == "subgoal": + return await self._handle_subgoal_command(event) + # Session-level toggles that are safe to run mid-agent — # /yolo can unblock a pending approval prompt, /verbose cycles # the tool-progress display mode for the ongoing stream. @@ -6442,6 +6448,9 @@ class GatewayRunner: if canonical == "goal": return await self._handle_goal_command(event) + if canonical == "subgoal": + return await self._handle_subgoal_command(event) + if canonical == "voice": return await self._handle_voice_command(event) @@ -6616,10 +6625,18 @@ class GatewayRunner: except Exception: session_entry = None if session_entry is not None: + # Pull the agent's full messages list from the result + # so the judge can dump it for its read_file tool. + _agent_messages: list = [] + if isinstance(_agent_result, dict): + _msgs = _agent_result.get("messages") + if isinstance(_msgs, list): + _agent_messages = _msgs await self._post_turn_goal_continuation( session_entry=session_entry, source=source, final_response=_final_text, + agent_messages=_agent_messages, ) except Exception as _goal_exc: logger.debug("goal continuation hook failed: %s", _goal_exc) @@ -9335,6 +9352,83 @@ class GatewayRunner: return t("gateway.goal.set", budget=state.max_turns, goal=state.goal) + async def _handle_subgoal_command(self, event: "MessageEvent") -> str: + """Handle /subgoal for gateway platforms. + + Forms (mirror of CLI): + /subgoal show the checklist + /subgoal append a user item + /subgoal complete | done mark item n completed + /subgoal impossible mark item n impossible + /subgoal undo revert item n to pending + /subgoal remove delete item n + /subgoal clear wipe the checklist + """ + args = (event.get_command_args() or "").strip() + + mgr, _session_entry = self._get_goal_manager_for_event(event) + if mgr is None: + return t("gateway.goal.unavailable") + + if not mgr.has_goal(): + return "No active goal. Set one with /goal ." + + if not args: + return f"{mgr.status_line()}\n{mgr.render_checklist()}" + + tokens = args.split(None, 1) + verb = tokens[0].lower() + rest = tokens[1].strip() if len(tokens) > 1 else "" + + action_status_map = { + "complete": "completed", + "completed": "completed", + "done": "completed", + "impossible": "impossible", + "imp": "impossible", + "skip": "impossible", + "undo": "pending", + "pending": "pending", + "reset": "pending", + } + if verb in action_status_map: + if not rest: + return f"Usage: /subgoal {verb} " + try: + idx = int(rest.split()[0]) + except ValueError: + return f"/subgoal {verb}: must be an integer (1-based index)." + try: + item = mgr.mark_subgoal(idx, action_status_map[verb]) + except (IndexError, ValueError, RuntimeError) as exc: + return f"/subgoal {verb}: {exc}" + return f"✓ Item {idx} → {item.status}: {item.text}" + + if verb == "remove": + if not rest: + return "Usage: /subgoal remove " + try: + idx = int(rest.split()[0]) + except ValueError: + return "/subgoal remove: must be an integer (1-based index)." + try: + removed = mgr.remove_subgoal(idx) + except (IndexError, RuntimeError) as exc: + return f"/subgoal remove: {exc}" + return f"✓ Removed item {idx}: {removed.text}" + + if verb == "clear": + mgr.clear_checklist() + return "✓ Checklist cleared. The judge will re-decompose on the next turn." + + # Otherwise — append `args` as a new user-authored checklist item. + try: + item = mgr.add_subgoal(args) + except (ValueError, RuntimeError) as exc: + return f"/subgoal: {exc}" + idx = len(mgr.state.checklist) if mgr.state else 0 + return f"✓ Added subgoal {idx}: {item.text}" + async def _send_goal_status_notice(self, source: Any, message: str) -> None: """Send a /goal judge status line back to the originating chat/thread.""" adapter = self.adapters.get(source.platform) @@ -9403,6 +9497,7 @@ class GatewayRunner: session_entry: Any, source: Any, final_response: str, + agent_messages: Optional[list] = None, ) -> None: """Run the goal judge after a gateway turn and, if still active, enqueue a continuation prompt for the same session. @@ -9430,7 +9525,11 @@ class GatewayRunner: if not mgr.is_active(): return - decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True) + decision = mgr.evaluate_after_turn( + final_response or "", + user_initiated=True, + messages=agent_messages or [], + ) msg = decision.get("message") or "" # Defer the status line until after the adapter has delivered the diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 3f543bba647..e2727c25bab 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -104,6 +104,8 @@ COMMAND_REGISTRY: list[CommandDef] = [ args_hint=""), CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session", args_hint="[text | pause | resume | clear | status]"), + CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session", + args_hint="[text | complete N | impossible N | undo N | remove N | clear]"), CommandDef("status", "Show session info", "Session"), CommandDef("whoami", "Show your slash command access (admin / user)", "Info"), CommandDef("profile", "Show active profile name and home directory", "Info"), diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index 894cdddb01b..4f54e74f66f 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -7,6 +7,18 @@ continuation prompt back into the same session and keeps working until the goal is done, turn budget is exhausted, the user pauses/clears it, or the user sends a new message (which takes priority and pauses the goal loop). +Checklist mode (added 2026-05): when a goal is set, a Phase-A "decompose" +call asks the judge to write an extremely detailed checklist of concrete +completion criteria for that goal. On every subsequent turn (Phase B) the +judge evaluates the agent's most recent output against EACH pending item +and may flip pending → completed | impossible, or append new items it +discovers along the way. The goal is done only when every checklist item +is in a terminal status. This is much harsher than the freeform +"is the goal done?" prompt and gives users a visible, verifiable progress +surface via /subgoal. A bounded read_file tool loop lets the judge inspect +the dumped conversation history when the snippet alone isn't enough to +rule. + State is persisted in SessionDB's ``state_meta`` table keyed by ``goal:`` so ``/resume`` picks it up. @@ -21,6 +33,9 @@ Design notes / invariants: prompt and also pauses the goal loop for that turn (we still re-judge after, so if the user's message happens to complete the goal the judge will say ``done``). +- Stickiness: once an item is marked completed or impossible, only the user + (via /subgoal undo) can flip it back. Judge updates that try to regress + terminal items are silently ignored. - This module has zero hard dependency on ``cli.HermesCLI`` or the gateway runner — both wire the same ``GoalManager`` in. @@ -31,10 +46,12 @@ from __future__ import annotations import json import logging +import os import re import time -from dataclasses import dataclass, asdict -from typing import Any, Dict, Optional, Tuple +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) @@ -44,8 +61,9 @@ logger = logging.getLogger(__name__) # ────────────────────────────────────────────────────────────────────── DEFAULT_MAX_TURNS = 20 -DEFAULT_JUDGE_TIMEOUT = 30.0 -# Cap how much of the last response + recent messages we send to the judge. +DEFAULT_JUDGE_TIMEOUT = 60.0 +# Cap how much of the last response we send to the judge inline. The judge +# can read the dumped conversation file via read_file if it needs more. _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # After this many consecutive judge *parse* failures (empty output / non-JSON), # the loop auto-pauses and points the user at the goal_judge config. API / @@ -55,8 +73,36 @@ _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # exhausted with every reply shaped like `judge returned empty response` or # `judge reply was not JSON`. DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3 +# Bound the Phase-B judge tool loop: if the judge keeps calling read_file +# without ever emitting a verdict, cap it so we don't burn the model's budget. +DEFAULT_MAX_JUDGE_TOOL_CALLS = 5 +# Cap a single read_file response so a judge that tries to read 100k lines +# doesn't blow up its own context. Judge can paginate if needed. +_JUDGE_READ_FILE_MAX_LINES = 400 +_JUDGE_READ_FILE_MAX_CHARS = 32_000 +# Status constants ──────────────────────────────────────────────────── +ITEM_PENDING = "pending" +ITEM_COMPLETED = "completed" +ITEM_IMPOSSIBLE = "impossible" +TERMINAL_ITEM_STATUSES = frozenset({ITEM_COMPLETED, ITEM_IMPOSSIBLE}) +VALID_ITEM_STATUSES = frozenset({ITEM_PENDING, ITEM_COMPLETED, ITEM_IMPOSSIBLE}) + +ITEM_MARKERS = { + ITEM_COMPLETED: "[x]", + ITEM_IMPOSSIBLE: "[!]", + ITEM_PENDING: "[ ]", +} + +ADDED_BY_JUDGE = "judge" +ADDED_BY_USER = "user" + + +# ────────────────────────────────────────────────────────────────────── +# Continuation prompt +# ────────────────────────────────────────────────────────────────────── + CONTINUATION_PROMPT_TEMPLATE = ( "[Continuing toward your standing goal]\n" "Goal: {goal}\n\n" @@ -65,8 +111,56 @@ CONTINUATION_PROMPT_TEMPLATE = ( "If you are blocked and need input from the user, say so clearly and stop." ) +CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE = ( + "[Continuing toward your standing goal]\n" + "Goal: {goal}\n\n" + "Checklist progress ({done}/{total} done):\n" + "{checklist}\n\n" + "Work on the unchecked items above. Do not declare items done yourself " + "— a judge marks them based on evidence in your output. If an item is " + "genuinely impossible in this environment, explain why so the judge can " + "mark it impossible. If you are blocked on a remaining item and need " + "user input, say so clearly and stop." +) -JUDGE_SYSTEM_PROMPT = ( + +# ────────────────────────────────────────────────────────────────────── +# Phase-A: decompose prompts +# ────────────────────────────────────────────────────────────────────── + +DECOMPOSE_SYSTEM_PROMPT = ( + "You are a strict judge for an autonomous agent. Your first job, before " + "judging anything, is to break the user's stated goal into an EXTREMELY " + "DETAILED checklist of concrete, verifiable completion criteria. Each " + "item must be specific enough that a third party reading the agent's " + "output could decide unambiguously whether that item was achieved.\n\n" + "Be exhaustive. Bias toward MORE items, not fewer. Include sub-items, " + "edge cases, quality bars, deployment steps, verification checks, and " + "anything the user would reasonably expect from a goal of this type. " + "If the user said 'build me a website' you should be enumerating " + "homepage exists, navigation links work, content is non-placeholder, " + "mobile responsive, accessibility tags present, deployed somewhere " + "publicly accessible, domain/URL is functional, etc. Better to " + "over-specify and let a few items get marked impossible than to " + "under-specify and let the agent declare victory early.\n\n" + "Reply ONLY with a single JSON object on one line:\n" + '{"checklist": [{"text": ""}, {"text": ""}, ...]}' +) + +DECOMPOSE_USER_PROMPT_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Produce the harshest, most detailed checklist of completion criteria " + "you can. Aim for at least 5 items; more is better when warranted. " + "Each item should be a single verifiable statement of fact about the " + "finished work." +) + + +# ────────────────────────────────────────────────────────────────────── +# Phase-B: evaluate prompts +# ────────────────────────────────────────────────────────────────────── + +EVALUATE_SYSTEM_PROMPT_FREEFORM = ( "You are a strict judge evaluating whether an autonomous agent has " "achieved a user's stated goal. You receive the goal text and the " "agent's most recent response. Your only job is to decide whether " @@ -78,11 +172,60 @@ JUDGE_SYSTEM_PROMPT = ( "user input (treat this as DONE with reason describing the block).\n\n" "Otherwise the goal is NOT done — CONTINUE.\n\n" "Reply ONLY with a single JSON object on one line:\n" - '{\"done\": , \"reason\": \"\"}' + '{"done": , "reason": ""}' ) +EVALUATE_SYSTEM_PROMPT_CHECKLIST = ( + "You are a strict judge evaluating an autonomous agent's progress on " + "a user's goal that has a detailed checklist of completion criteria. " + "For EACH currently-pending checklist item, decide whether the " + "available evidence shows the item is satisfied.\n\n" + "Be strict but not absurd. Default to leaving items pending UNLESS " + "evidence is reasonably clear. Reasonable evidence includes:\n" + "- The agent's most recent response describing or showing the work\n" + "- Tool call results visible in the conversation history (file writes, " + "command output, web requests, etc.)\n" + "- A clear statement by the agent that the work was done, when " + "supported by tool output earlier in the conversation\n\n" + "Do NOT require the agent to re-prove items it has already established " + "in earlier turns. If a tool call earlier in the conversation already " + "wrote a file, you do not need fresh `ls` output every turn — once " + "established, it's done.\n\n" + "Flip pending → completed when the response or recent tool calls show " + "the item is satisfied. Flip pending → impossible only when the work " + "demonstrates the item cannot be achieved in this environment (NOT " + "merely that the agent didn't try). Vague intentions ('I will do X " + "next') do NOT count as completion.\n\n" + "You may APPEND new checklist items if the agent's work reveals " + "criteria the original decomposition missed. Stay strict — only add " + "items that genuinely belong as completion criteria.\n\n" + "STICKINESS: items already marked completed or impossible are frozen. " + "Do not include them in your updates. Only the user can revert them.\n\n" + "TOOLS: you have read_file(path, offset, limit) available. The full " + "conversation history for this session is dumped to a JSON file whose " + "path is given in the user message. Call read_file on it when the " + "snippet is ambiguous, when you need to verify a tool call actually " + "happened, or when you want to see what skills the agent loaded. " + "Otherwise, judge from the snippet directly — extra reads cost tokens.\n\n" + "When you are ready to rule, reply ONLY with a single JSON object:\n" + '{"updates": [{"index": , "status": "completed|impossible", "evidence": ""}, ...], ' + '"new_items": [{"text": ""}, ...], ' + '"reason": ""}\n' + "When citing evidence, reference the agent's actual output specifically. " + "Empty updates is fine. Empty new_items is fine. The reason field is required." +) -JUDGE_USER_PROMPT_TEMPLATE = ( +EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Current checklist (each item is numbered, 1-based — use these " + "exact 1-based numbers as the ``index`` field in your updates):\n{checklist_block}\n\n" + "Agent's most recent response (snippet):\n{response}\n\n" + "Conversation history file (call read_file on this path if you need " + "more context — pagination supported via offset/limit):\n{history_path}\n\n" + "Evaluate each pending item. Cite specific evidence." +) + +EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = ( "Goal:\n{goal}\n\n" "Agent's most recent response:\n{response}\n\n" "Is the goal satisfied?" @@ -90,16 +233,55 @@ JUDGE_USER_PROMPT_TEMPLATE = ( # ────────────────────────────────────────────────────────────────────── -# Dataclass +# Dataclasses # ────────────────────────────────────────────────────────────────────── +@dataclass +class ChecklistItem: + """One concrete completion criterion attached to a goal.""" + + text: str + status: str = ITEM_PENDING # pending | completed | impossible + added_by: str = ADDED_BY_JUDGE # judge | user + added_at: float = 0.0 + completed_at: Optional[float] = None + evidence: Optional[str] = None # judge's rationale on flip + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ChecklistItem": + text = str(data.get("text", "")).strip() + if not text: + text = "(empty item)" + status = str(data.get("status", ITEM_PENDING)).strip().lower() + if status not in VALID_ITEM_STATUSES: + status = ITEM_PENDING + added_by = str(data.get("added_by", ADDED_BY_JUDGE)).strip().lower() + if added_by not in (ADDED_BY_JUDGE, ADDED_BY_USER): + added_by = ADDED_BY_JUDGE + return cls( + text=text, + status=status, + added_by=added_by, + added_at=float(data.get("added_at", 0.0) or 0.0), + completed_at=( + float(data["completed_at"]) + if data.get("completed_at") is not None + else None + ), + evidence=data.get("evidence"), + ) + + @dataclass class GoalState: """Serializable goal state stored per session.""" goal: str - status: str = "active" # active | paused | done | cleared + status: str = "active" # active | paused | done | cleared turns_used: int = 0 max_turns: int = DEFAULT_MAX_TURNS created_at: float = 0.0 @@ -108,13 +290,28 @@ class GoalState: last_reason: Optional[str] = None paused_reason: Optional[str] = None # why we auto-paused (budget, etc.) consecutive_parse_failures: int = 0 # judge-output parse failures in a row + # Checklist mode (added 2026-05). Both fields default safely so old + # state_meta rows load unchanged. + checklist: List[ChecklistItem] = field(default_factory=list) + decomposed: bool = False # has Phase-A run for this goal? def to_json(self) -> str: - return json.dumps(asdict(self), ensure_ascii=False) + data = asdict(self) + # asdict already serializes ChecklistItem via dataclass recursion. + return json.dumps(data, ensure_ascii=False) @classmethod def from_json(cls, raw: str) -> "GoalState": data = json.loads(raw) + raw_checklist = data.get("checklist") or [] + checklist: List[ChecklistItem] = [] + if isinstance(raw_checklist, list): + for item in raw_checklist: + if isinstance(item, dict): + try: + checklist.append(ChecklistItem.from_dict(item)) + except Exception: + continue return cls( goal=data.get("goal", ""), status=data.get("status", "active"), @@ -126,8 +323,39 @@ class GoalState: last_reason=data.get("last_reason"), paused_reason=data.get("paused_reason"), consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0), + checklist=checklist, + decomposed=bool(data.get("decomposed", False)), ) + # --- checklist helpers ------------------------------------------------ + + def checklist_counts(self) -> Tuple[int, int, int, int]: + """Return (total, completed, impossible, pending).""" + total = len(self.checklist) + completed = sum(1 for it in self.checklist if it.status == ITEM_COMPLETED) + impossible = sum(1 for it in self.checklist if it.status == ITEM_IMPOSSIBLE) + pending = total - completed - impossible + return total, completed, impossible, pending + + def all_terminal(self) -> bool: + """True iff at least one item exists and every item is in a terminal status.""" + if not self.checklist: + return False + return all(it.status in TERMINAL_ITEM_STATUSES for it in self.checklist) + + def render_checklist(self, *, numbered: bool = False) -> str: + if not self.checklist: + return "(empty)" + lines = [] + for i, item in enumerate(self.checklist, start=1): + marker = ITEM_MARKERS.get(item.status, "[?]") + prefix = f"{i}. {marker}" if numbered else f" {marker}" + line = f"{prefix} {item.text}" + if item.status == ITEM_IMPOSSIBLE and item.evidence: + line += f" (impossible: {item.evidence})" + lines.append(line) + return "\n".join(lines) + # ────────────────────────────────────────────────────────────────────── # Persistence (SessionDB state_meta) @@ -215,7 +443,63 @@ def clear_goal(session_id: str) -> None: # ────────────────────────────────────────────────────────────────────── -# Judge +# Conversation-history dump (read by the judge tool loop) +# ────────────────────────────────────────────────────────────────────── + + +def _goals_dump_dir() -> Optional[Path]: + """Return ``/goals`` (created on first use), or None on error.""" + try: + from hermes_constants import get_hermes_home + + home = Path(get_hermes_home()) + except Exception as exc: + logger.debug("goals dump dir: get_hermes_home failed: %s", exc) + return None + try: + path = home / "goals" + path.mkdir(parents=True, exist_ok=True) + return path + except Exception as exc: + logger.debug("goals dump dir: mkdir failed: %s", exc) + return None + + +def _safe_session_filename(session_id: str) -> str: + """Make a session_id safe for use as a filename component.""" + cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", session_id or "unknown") + # Bound length to keep filesystem happy. + return cleaned[:128] or "unknown" + + +def conversation_dump_path(session_id: str) -> Optional[Path]: + """Where the dumped messages JSON for ``session_id`` lives.""" + base = _goals_dump_dir() + if base is None: + return None + return base / f"{_safe_session_filename(session_id)}.json" + + +def dump_conversation(session_id: str, messages: List[Dict[str, Any]]) -> Optional[Path]: + """Write ``messages`` to the goals/ dump file. Returns the path on success.""" + if not session_id or not messages: + return None + path = conversation_dump_path(session_id) + if path is None: + return None + try: + # Best-effort: messages may contain non-JSON-serializable objects from + # provider-specific adapter shims. Fall through with default=str. + with open(path, "w", encoding="utf-8") as fh: + json.dump(messages, fh, ensure_ascii=False, indent=2, default=str) + return path + except Exception as exc: + logger.debug("dump_conversation: write failed: %s", exc) + return None + + +# ────────────────────────────────────────────────────────────────────── +# Judge: parsing helpers # ────────────────────────────────────────────────────────────────────── @@ -227,11 +511,34 @@ def _truncate(text: str, limit: int) -> str: return text[:limit] + "… [truncated]" -_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL) +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]: + """Best-effort extraction of a single JSON object from a possibly-prosey reply.""" + if not raw: + return None + text = raw.strip() + if text.startswith("```"): + text = text.strip("`") + nl = text.find("\n") + if nl != -1: + text = text[nl + 1:] + try: + data = json.loads(text) + except Exception: + match = _JSON_OBJECT_RE.search(text) + if not match: + return None + try: + data = json.loads(match.group(0)) + except Exception: + return None + return data if isinstance(data, dict) else None def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: - """Parse the judge's reply. Fail-open to ``(False, "", parse_failed)``. + """Parse the freeform judge's reply. Fail-open to ``(False, "", parse_failed)``. Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the judge returned output that couldn't be interpreted as the expected JSON @@ -242,30 +549,8 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: if not raw: return False, "judge returned empty response", True - text = raw.strip() - - # Strip markdown code fences the model may wrap JSON in. - if text.startswith("```"): - text = text.strip("`") - # Peel off leading json/JSON/etc tag - nl = text.find("\n") - if nl != -1: - text = text[nl + 1:] - - # First try: parse the whole blob. - data: Optional[Dict[str, Any]] = None - try: - data = json.loads(text) - except Exception: - # Second try: pull the first JSON object out. - match = _JSON_OBJECT_RE.search(text) - if match: - try: - data = json.loads(match.group(0)) - except Exception: - data = None - - if not isinstance(data, dict): + data = _extract_json_object(raw) + if data is None: return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True done_val = data.get("done") @@ -279,49 +564,286 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: return done, reason, False -def judge_goal( +def _parse_decompose_response(raw: str) -> Tuple[List[Dict[str, Any]], bool]: + """Parse a Phase-A decompose reply. Returns (items, parse_failed).""" + if not raw: + return [], True + data = _extract_json_object(raw) + if data is None: + return [], True + raw_items = data.get("checklist") + if not isinstance(raw_items, list): + return [], True + out: List[Dict[str, Any]] = [] + for item in raw_items: + if isinstance(item, dict): + text = str(item.get("text", "")).strip() + if text: + out.append({"text": text}) + elif isinstance(item, str): + text = item.strip() + if text: + out.append({"text": text}) + return out, False + + +def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]: + """Parse a Phase-B checklist eval reply. Returns (parsed, parse_failed). + + parsed = {"updates": [...], "new_items": [...], "reason": str} + """ + if not raw: + return {"updates": [], "new_items": [], "reason": "judge returned empty response"}, True + data = _extract_json_object(raw) + if data is None: + return ( + { + "updates": [], + "new_items": [], + "reason": f"judge reply was not JSON: {_truncate(raw, 200)!r}", + }, + True, + ) + updates = data.get("updates") or [] + new_items = data.get("new_items") or [] + reason = str(data.get("reason") or "").strip() or "no reason provided" + norm_updates = [] + if isinstance(updates, list): + for upd in updates: + if not isinstance(upd, dict): + continue + try: + # Judge sees the checklist rendered with 1-based indices + # (matches the /subgoal CLI). Convert to 0-based here so the + # apply layer can index ``state.checklist`` directly. + idx_1based = int(upd.get("index")) + except (TypeError, ValueError): + continue + idx = idx_1based - 1 + status = str(upd.get("status", "")).strip().lower() + if status not in TERMINAL_ITEM_STATUSES: + # Phase-B only accepts terminal flips. Pending → pending is a no-op. + continue + evidence = str(upd.get("evidence") or "").strip() or None + norm_updates.append({"index": idx, "status": status, "evidence": evidence}) + norm_new = [] + if isinstance(new_items, list): + for it in new_items: + if isinstance(it, dict): + text = str(it.get("text", "")).strip() + if text: + norm_new.append({"text": text}) + elif isinstance(it, str): + text = it.strip() + if text: + norm_new.append({"text": text}) + return {"updates": norm_updates, "new_items": norm_new, "reason": reason}, False + + +# ────────────────────────────────────────────────────────────────────── +# Judge: read_file tool for the judge's bounded tool loop +# ────────────────────────────────────────────────────────────────────── + + +_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = { + "type": "function", + "function": { + "name": "read_file", + "description": ( + "Read a portion of the dumped conversation history JSON file. " + "Use this when the snippet alone isn't enough to rule. Returns " + "lines from the file with 1-based line numbers. Pagination " + "supported via offset and limit. Reads beyond a built-in cap " + "are truncated." + ), + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": ( + "Absolute path to the conversation history file. " + "You were given this in the user message." + ), + }, + "offset": { + "type": "integer", + "description": "1-indexed starting line number (default 1).", + "default": 1, + }, + "limit": { + "type": "integer", + "description": ( + f"Max lines to return (default {_JUDGE_READ_FILE_MAX_LINES})." + ), + "default": _JUDGE_READ_FILE_MAX_LINES, + }, + }, + "required": ["path"], + }, + }, +} + + +def _judge_read_file( + path: str, + *, + offset: int = 1, + limit: int = _JUDGE_READ_FILE_MAX_LINES, + allowed_path: Optional[Path] = None, +) -> str: + """Bounded read of the dumped conversation file. Returns JSON-serializable text. + + Restricted to ``allowed_path`` when provided — the judge cannot use this + tool to read arbitrary files. + """ + if not path: + return json.dumps({"error": "path is required"}) + try: + target = Path(path).resolve() + except Exception as exc: + return json.dumps({"error": f"path resolve failed: {exc}"}) + + if allowed_path is not None: + try: + allowed = allowed_path.resolve() + except Exception: + allowed = allowed_path + if target != allowed: + return json.dumps({ + "error": ( + f"read_file is restricted to the conversation dump path. " + f"Allowed: {allowed}" + ) + }) + + if not target.exists(): + return json.dumps({"error": f"file not found: {target}"}) + try: + offset = max(1, int(offset or 1)) + limit = max(1, min(int(limit or _JUDGE_READ_FILE_MAX_LINES), _JUDGE_READ_FILE_MAX_LINES)) + except (TypeError, ValueError): + return json.dumps({"error": "offset and limit must be integers"}) + + try: + with open(target, "r", encoding="utf-8", errors="replace") as fh: + lines = fh.readlines() + except Exception as exc: + return json.dumps({"error": f"read failed: {exc}"}) + + total = len(lines) + start = offset - 1 + end = min(start + limit, total) + slice_lines = lines[start:end] + out = "".join(slice_lines) + if len(out) > _JUDGE_READ_FILE_MAX_CHARS: + out = out[:_JUDGE_READ_FILE_MAX_CHARS] + "\n… [truncated by judge read cap]" + return json.dumps({ + "path": str(target), + "total_lines": total, + "offset": offset, + "returned": len(slice_lines), + "next_offset": end + 1 if end < total else None, + "content": out, + }, ensure_ascii=False) + + +# ────────────────────────────────────────────────────────────────────── +# Judge: phase-A (decompose) and phase-B (evaluate) +# ────────────────────────────────────────────────────────────────────── + + +def _get_judge_client() -> Tuple[Optional[Any], str]: + """Return (client, model) or (None, '') when unavailable.""" + try: + from agent.auxiliary_client import get_text_auxiliary_client + except Exception as exc: + logger.debug("goal judge: auxiliary client import failed: %s", exc) + return None, "" + try: + client, model = get_text_auxiliary_client("goal_judge") + except Exception as exc: + logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) + return None, "" + if client is None or not model: + return None, "" + return client, model + + +def decompose_goal( + goal: str, + *, + timeout: float = DEFAULT_JUDGE_TIMEOUT, +) -> Tuple[List[Dict[str, Any]], Optional[str]]: + """Phase-A: ask the judge to break the goal into a checklist. + + Returns ``(items, error)``. On any failure, returns ``([], reason)`` so + the caller can decide whether to fall back to freeform mode. + """ + if not goal.strip(): + return [], "empty goal" + + client, model = _get_judge_client() + if client is None: + return [], "auxiliary client unavailable" + + try: + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT}, + { + "role": "user", + "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format( + goal=_truncate(goal, 4000) + ), + }, + ], + temperature=0, + max_tokens=2000, + timeout=timeout, + ) + except Exception as exc: + logger.info("goal decompose: API call failed (%s)", exc) + return [], f"decompose error: {type(exc).__name__}" + + try: + raw = resp.choices[0].message.content or "" + except Exception: + raw = "" + + items, parse_failed = _parse_decompose_response(raw) + if parse_failed or not items: + logger.info( + "goal decompose: parse failed or empty checklist (raw=%r)", + _truncate(raw, 200), + ) + return [], "decompose parse failed or empty" + logger.info("goal decompose: produced %d checklist items", len(items)) + return items, None + + +def judge_goal_freeform( goal: str, last_response: str, *, timeout: float = DEFAULT_JUDGE_TIMEOUT, ) -> Tuple[str, str, bool]: - """Ask the auxiliary model whether the goal is satisfied. + """Legacy freeform judge — kept for goals with no checklist. Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``, - ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached). - - ``parse_failed`` is True only when the judge call succeeded but its output - was unusable (empty or non-JSON). API/transport errors return False — they - are transient and should fail-open silently. Callers use this flag to - auto-pause after N consecutive parse failures (see - ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``). - - This is deliberately fail-open: any error returns ``("continue", "...", False)`` - so a broken judge doesn't wedge progress — the turn budget and the - consecutive-parse-failures auto-pause are the backstops. + ``"continue"``, or ``"skipped"``. """ if not goal.strip(): return "skipped", "empty goal", False if not last_response.strip(): - # No substantive reply this turn — almost certainly not done yet. return "continue", "empty response (nothing to evaluate)", False - try: - from agent.auxiliary_client import get_text_auxiliary_client - except Exception as exc: - logger.debug("goal judge: auxiliary client import failed: %s", exc) + client, model = _get_judge_client() + if client is None: return "continue", "auxiliary client unavailable", False - try: - client, model = get_text_auxiliary_client("goal_judge") - except Exception as exc: - logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) - return "continue", "auxiliary client unavailable", False - - if client is None or not model: - return "continue", "no auxiliary client configured", False - - prompt = JUDGE_USER_PROMPT_TEMPLATE.format( + prompt = EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE.format( goal=_truncate(goal, 2000), response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), ) @@ -330,7 +852,7 @@ def judge_goal( resp = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": JUDGE_SYSTEM_PROMPT}, + {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_FREEFORM}, {"role": "user", "content": prompt}, ], temperature=0, @@ -348,10 +870,178 @@ def judge_goal( done, reason, parse_failed = _parse_judge_response(raw) verdict = "done" if done else "continue" - logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120)) + logger.info("goal judge (freeform): verdict=%s reason=%s", verdict, _truncate(reason, 120)) return verdict, reason, parse_failed +def evaluate_checklist( + state: GoalState, + last_response: str, + *, + history_path: Optional[Path], + timeout: float = DEFAULT_JUDGE_TIMEOUT, + max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS, +) -> Tuple[Dict[str, Any], bool]: + """Phase-B: judge evaluates each pending checklist item. + + Runs a bounded tool loop so the judge can call ``read_file`` on the + dumped conversation history when the snippet isn't enough. + + Returns ``(parsed, parse_failed)`` where parsed is + ``{"updates": [...], "new_items": [...], "reason": str}``. + Falls open on transport errors: empty updates/new_items, parse_failed=False. + """ + client, model = _get_judge_client() + if client is None: + return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False) + + # Render checklist with 1-based indices the judge can address. + checklist_block = state.render_checklist(numbered=True) + + user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format( + goal=_truncate(state.goal, 2000), + checklist_block=checklist_block, + response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), + history_path=str(history_path) if history_path else "(unavailable — judge from snippet only)", + ) + + messages: List[Dict[str, Any]] = [ + {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_CHECKLIST}, + {"role": "user", "content": user_prompt}, + ] + + # Some auxiliary providers may not support tool calls. We pass tools + # optimistically; if the provider returns a verdict directly without + # using them, we just parse it. + tools = [_JUDGE_READ_FILE_TOOL_SCHEMA] if history_path is not None else None + + tool_calls_left = max(0, int(max_tool_calls)) + final_raw = "" + + for _ in range(tool_calls_left + 1): + try: + kwargs: Dict[str, Any] = { + "model": model, + "messages": messages, + "temperature": 0, + "max_tokens": 1500, + "timeout": timeout, + } + if tools: + kwargs["tools"] = tools + kwargs["tool_choice"] = "auto" + resp = client.chat.completions.create(**kwargs) + except Exception as exc: + logger.info("goal judge (checklist): API call failed (%s)", exc) + return ( + { + "updates": [], + "new_items": [], + "reason": f"judge error: {type(exc).__name__}", + }, + False, + ) + + try: + choice = resp.choices[0] + msg = choice.message + except Exception: + return ( + {"updates": [], "new_items": [], "reason": "judge response malformed"}, + True, + ) + + # Unpack tool_calls in a way that works for openai-py and other shims. + tool_calls = getattr(msg, "tool_calls", None) or [] + content = getattr(msg, "content", "") or "" + + if not tool_calls: + final_raw = content + break + + if tool_calls_left <= 0: + # Out of budget. Force a final ruling on the next pass by + # appending a system note and disabling tools. + messages.append({ + "role": "user", + "content": ( + "You have exhausted your read_file budget. Issue your " + "final JSON verdict now without calling more tools." + ), + }) + tools = None + continue + + # Append the assistant turn, then handle each tool call. + assistant_record: Dict[str, Any] = { + "role": "assistant", + "content": content, + "tool_calls": [], + } + for tc in tool_calls: + try: + tc_id = getattr(tc, "id", None) or "tc-?" + fn = getattr(tc, "function", None) + fn_name = getattr(fn, "name", "") if fn is not None else "" + fn_args = getattr(fn, "arguments", "") if fn is not None else "" + assistant_record["tool_calls"].append({ + "id": tc_id, + "type": "function", + "function": {"name": fn_name, "arguments": fn_args}, + }) + except Exception: + continue + messages.append(assistant_record) + + for tc in tool_calls: + try: + tc_id = getattr(tc, "id", None) or "tc-?" + fn = getattr(tc, "function", None) + fn_name = getattr(fn, "name", "") if fn is not None else "" + fn_args_raw = getattr(fn, "arguments", "") if fn is not None else "" + except Exception: + continue + try: + args = json.loads(fn_args_raw) if isinstance(fn_args_raw, str) else (fn_args_raw or {}) + except Exception: + args = {} + if fn_name == "read_file": + tool_result = _judge_read_file( + str(args.get("path", "")), + offset=args.get("offset", 1), + limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES), + allowed_path=history_path, + ) + else: + tool_result = json.dumps({"error": f"unknown tool: {fn_name}"}) + messages.append({ + "role": "tool", + "tool_call_id": tc_id, + "name": fn_name, + "content": tool_result, + }) + tool_calls_left -= 1 + + if tool_calls_left <= 0: + messages.append({ + "role": "user", + "content": ( + "You have exhausted your read_file budget. Issue your " + "final JSON verdict now without calling more tools." + ), + }) + tools = None + + parsed, parse_failed = _parse_evaluate_response(final_raw) + logger.info( + "goal judge (checklist): updates=%d new_items=%d reason=%s", + len(parsed.get("updates") or []), + len(parsed.get("new_items") or []), + _truncate(parsed.get("reason", ""), 120), + ) + return parsed, parse_failed + + # ────────────────────────────────────────────────────────────────────── # GoalManager — the orchestration surface CLI + gateway talk to # ────────────────────────────────────────────────────────────────────── @@ -368,8 +1058,12 @@ class GoalManager: - ``clear()`` — remove the active goal. - ``pause()`` / ``resume()`` — explicit user controls. - ``status()`` — printable one-liner. - - ``evaluate_after_turn(last_response)`` — call the judge, update state, - and return a decision dict the caller uses to drive the next turn. + - ``add_subgoal(text)`` — user appends a checklist item. + - ``mark_subgoal(index, status)`` — user flips an item (override). + - ``remove_subgoal(index)`` — user deletes an item. + - ``clear_checklist()`` — user wipes the checklist; next turn re-decomposes. + - ``evaluate_after_turn(last_response, agent=None)`` — call the judge, + update state, return a decision dict. - ``next_continuation_prompt()`` — the canonical user-role message to feed back into ``run_conversation``. """ @@ -396,14 +1090,26 @@ class GoalManager: if s is None or s.status in ("cleared",): return "No active goal. Set one with /goal ." turns = f"{s.turns_used}/{s.max_turns} turns" + cl_total, cl_done, cl_imp, _ = s.checklist_counts() + cl_text = "" + if cl_total: + cl_text = f", {cl_done + cl_imp}/{cl_total} done" if s.status == "active": - return f"⊙ Goal (active, {turns}): {s.goal}" + return f"⊙ Goal (active, {turns}{cl_text}): {s.goal}" if s.status == "paused": extra = f" — {s.paused_reason}" if s.paused_reason else "" - return f"⏸ Goal (paused, {turns}{extra}): {s.goal}" + return f"⏸ Goal (paused, {turns}{cl_text}{extra}): {s.goal}" if s.status == "done": - return f"✓ Goal done ({turns}): {s.goal}" - return f"Goal ({s.status}, {turns}): {s.goal}" + return f"✓ Goal done ({turns}{cl_text}): {s.goal}" + return f"Goal ({s.status}, {turns}{cl_text}): {s.goal}" + + def render_checklist(self) -> str: + """Public helper for the /subgoal slash command.""" + if self._state is None: + return "(no active goal)" + if not self._state.checklist: + return "(checklist empty — judge will populate it on the next turn)" + return self._state.render_checklist(numbered=True) # --- mutation ----------------------------------------------------- @@ -418,6 +1124,8 @@ class GoalManager: max_turns=int(max_turns) if max_turns else self.default_max_turns, created_at=time.time(), last_turn_at=0.0, + checklist=[], + decomposed=False, ) self._state = state save_goal(self.session_id, state) @@ -456,6 +1164,77 @@ class GoalManager: self._state.last_reason = reason save_goal(self.session_id, self._state) + # --- /subgoal user controls --------------------------------------- + + def add_subgoal(self, text: str) -> ChecklistItem: + """User appends a new checklist item. Requires an active or paused goal.""" + if self._state is None: + raise RuntimeError("no active goal") + text = (text or "").strip() + if not text: + raise ValueError("subgoal text is empty") + item = ChecklistItem( + text=text, + status=ITEM_PENDING, + added_by=ADDED_BY_USER, + added_at=time.time(), + ) + self._state.checklist.append(item) + save_goal(self.session_id, self._state) + return item + + def mark_subgoal(self, index_1based: int, status: str) -> ChecklistItem: + """User overrides an item's status. + + ``status`` may be ``completed``, ``impossible``, or ``pending`` + (the last only as an undo flow). Stickiness rules do NOT apply to + user actions — the user is the only authority that can revert + terminal items. + """ + if self._state is None: + raise RuntimeError("no active goal") + status = (status or "").strip().lower() + if status not in VALID_ITEM_STATUSES: + raise ValueError( + f"status must be one of {sorted(VALID_ITEM_STATUSES)}; got {status!r}" + ) + idx = int(index_1based) - 1 + if idx < 0 or idx >= len(self._state.checklist): + raise IndexError( + f"index out of range (1..{len(self._state.checklist)})" + ) + item = self._state.checklist[idx] + item.status = status + if status in TERMINAL_ITEM_STATUSES: + item.completed_at = time.time() + if not item.evidence: + item.evidence = "marked by user" + else: + item.completed_at = None + # Don't wipe judge-supplied evidence on undo — useful audit trail. + save_goal(self.session_id, self._state) + return item + + def remove_subgoal(self, index_1based: int) -> ChecklistItem: + if self._state is None: + raise RuntimeError("no active goal") + idx = int(index_1based) - 1 + if idx < 0 or idx >= len(self._state.checklist): + raise IndexError( + f"index out of range (1..{len(self._state.checklist)})" + ) + removed = self._state.checklist.pop(idx) + save_goal(self.session_id, self._state) + return removed + + def clear_checklist(self) -> None: + """Wipe the checklist and reset decomposed=False so the judge re-decomposes.""" + if self._state is None: + return + self._state.checklist = [] + self._state.decomposed = False + save_goal(self.session_id, self._state) + # --- the main entry point called after every turn ----------------- def evaluate_after_turn( @@ -463,6 +1242,8 @@ class GoalManager: last_response: str, *, user_initiated: bool = True, + agent: Any = None, + messages: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: """Run the judge and update state. Return a decision dict. @@ -470,11 +1251,21 @@ class GoalManager: continuation prompt we fed ourselves (False). Both increment ``turns_used`` because both consume model budget. + ``messages`` is the agent's full conversation list for this session. + When provided, it's dumped to ``/goals/.json`` so + the Phase-B judge's read_file tool can inspect history. Optional — + when missing, the judge runs from the snippet only. + + ``agent`` is a back-compat path — when ``messages`` is None we try + to extract them from common AIAgent attribute names. Most callers + should pass ``messages`` directly because AIAgent does not store + the message list as a public instance attribute. + Decision keys: - ``status``: current goal status after update - ``should_continue``: bool — caller should fire another turn - ``continuation_prompt``: str or None - - ``verdict``: "done" | "continue" | "skipped" | "inactive" + - ``verdict``: "done" | "continue" | "skipped" | "inactive" | "decompose" - ``reason``: str - ``message``: user-visible one-liner to print/send """ @@ -493,7 +1284,45 @@ class GoalManager: state.turns_used += 1 state.last_turn_at = time.time() - verdict, reason, parse_failed = judge_goal(state.goal, last_response) + # ── Phase A: decompose (first call after /goal set) ─────────── + if not state.decomposed: + items, err = decompose_goal(state.goal) + state.decomposed = True + decompose_message = "" + if items: + now = time.time() + for entry in items: + state.checklist.append( + ChecklistItem( + text=entry["text"], + status=ITEM_PENDING, + added_by=ADDED_BY_JUDGE, + added_at=now, + ) + ) + state.last_verdict = "decompose" + state.last_reason = f"decomposed into {len(items)} items" + decompose_message = ( + f"⊙ Goal checklist created ({len(items)} items). " + f"Use /subgoal to view or edit it." + ) + save_goal(self.session_id, state) + return { + "status": "active", + "should_continue": True, + "continuation_prompt": self.next_continuation_prompt(), + "verdict": "decompose", + "reason": state.last_reason, + "message": decompose_message, + } + # Decompose failed — fall through to freeform mode below. + logger.info("goal: decompose failed (%s) — falling back to freeform judge", err) + state.last_reason = f"decompose failed: {err}" + + # ── Phase B: evaluate ──────────────────────────────────────── + verdict, reason, parse_failed = self._evaluate_state_phase_b( + state, last_response, agent=agent, messages=messages + ) state.last_verdict = verdict state.last_reason = reason @@ -518,11 +1347,7 @@ class GoalManager: } # Auto-pause when the judge model can't produce the expected JSON - # verdict N turns in a row. Points the user at the goal_judge config - # so they can route this side task to a model that follows the - # contract (e.g. google/gemini-3-flash-preview). Without this guard, - # weak judge models burn the entire turn budget returning prose or - # empty strings. + # verdict N turns in a row. if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES: state.status = "paused" state.paused_reason = ( @@ -564,6 +1389,10 @@ class GoalManager: } save_goal(self.session_id, state) + cl_total, cl_done, cl_imp, _ = state.checklist_counts() + progress = "" + if cl_total: + progress = f" — {cl_done + cl_imp}/{cl_total} done" return { "status": "active", "should_continue": True, @@ -571,23 +1400,168 @@ class GoalManager: "verdict": "continue", "reason": reason, "message": ( - f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}" + f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}{progress}): {reason}" ), } + def _evaluate_state_phase_b( + self, + state: GoalState, + last_response: str, + *, + agent: Any = None, + messages: Optional[List[Dict[str, Any]]] = None, + ) -> Tuple[str, str, bool]: + """Run the right kind of Phase-B evaluation given current state. + + With a non-empty checklist: harsh per-item evaluation with a bounded + read_file tool loop. + + With an empty checklist (e.g. decompose failed twice): fall back to + the legacy freeform judge so the goal still has a way to terminate. + """ + if not last_response.strip(): + return "continue", "empty response (nothing to evaluate)", False + + if state.checklist: + # Dump conversation history if we have one. Prefer explicit + # ``messages`` arg (most reliable); fall back to extracting from + # the agent instance for back-compat. + history_path: Optional[Path] = None + msgs: List[Dict[str, Any]] = [] + if messages: + msgs = list(messages) + elif agent is not None: + msgs = self._extract_agent_messages(agent) + if msgs: + history_path = dump_conversation(self.session_id, msgs) + if history_path is None: + logger.debug( + "goal: conversation dump failed for session %s", + self.session_id, + ) + else: + logger.debug( + "goal: no messages available for session %s — judge will run from snippet only", + self.session_id, + ) + + parsed, parse_failed = evaluate_checklist( + state, last_response, history_path=history_path + ) + self._apply_checklist_updates(state, parsed) + + if state.all_terminal(): + return "done", parsed.get("reason") or "all checklist items terminal", parse_failed + return "continue", parsed.get("reason") or "checklist progress", parse_failed + + # No checklist — freeform fallback. + verdict, reason, parse_failed = judge_goal_freeform(state.goal, last_response) + return verdict, reason, parse_failed + + # --- internal helpers --------------------------------------------- + + @staticmethod + def _extract_agent_messages(agent: Any) -> List[Dict[str, Any]]: + """Best-effort extraction of the agent's conversation history. + + Tries common attribute names so we don't tightly couple to AIAgent. + Returns an empty list when nothing is available. + """ + for attr in ("messages", "conversation_history", "_messages", "history"): + try: + msgs = getattr(agent, attr, None) + if isinstance(msgs, list) and msgs: + return msgs + except Exception: + continue + return [] + + @staticmethod + def _apply_checklist_updates(state: GoalState, parsed: Dict[str, Any]) -> None: + """Apply judge updates with stickiness: never regress terminal items.""" + now = time.time() + for upd in parsed.get("updates") or []: + try: + idx = int(upd["index"]) + except (KeyError, TypeError, ValueError): + continue + if idx < 0 or idx >= len(state.checklist): + continue + item = state.checklist[idx] + if item.status in TERMINAL_ITEM_STATUSES: + # Stickiness: judge cannot regress a terminal item. + continue + new_status = upd.get("status") + if new_status not in TERMINAL_ITEM_STATUSES: + continue + item.status = new_status + item.completed_at = now + evidence = upd.get("evidence") + if evidence: + item.evidence = evidence + + for new_item in parsed.get("new_items") or []: + text = (new_item.get("text") or "").strip() + if not text: + continue + state.checklist.append( + ChecklistItem( + text=text, + status=ITEM_PENDING, + added_by=ADDED_BY_JUDGE, + added_at=now, + ) + ) + + # --- continuation prompt ------------------------------------------ + def next_continuation_prompt(self) -> Optional[str]: if not self._state or self._state.status != "active": return None - return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) + if not self._state.checklist: + return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) + cl_total, cl_done, cl_imp, _ = self._state.checklist_counts() + return CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE.format( + goal=self._state.goal, + done=cl_done + cl_imp, + total=cl_total, + checklist=self._state.render_checklist(numbered=False), + ) + + +# Public name kept for back-compat with the previous freeform-only API. +def judge_goal( + goal: str, + last_response: str, + *, + timeout: float = DEFAULT_JUDGE_TIMEOUT, +) -> Tuple[str, str, bool]: + """Back-compat wrapper — defers to the freeform judge.""" + return judge_goal_freeform(goal, last_response, timeout=timeout) __all__ = [ + "ChecklistItem", "GoalState", "GoalManager", "CONTINUATION_PROMPT_TEMPLATE", + "CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE", "DEFAULT_MAX_TURNS", + "DEFAULT_MAX_JUDGE_TOOL_CALLS", + "ITEM_PENDING", + "ITEM_COMPLETED", + "ITEM_IMPOSSIBLE", + "ITEM_MARKERS", + "TERMINAL_ITEM_STATUSES", + "VALID_ITEM_STATUSES", "load_goal", "save_goal", "clear_goal", "judge_goal", + "judge_goal_freeform", + "decompose_goal", + "evaluate_checklist", + "conversation_dump_path", + "dump_conversation", ] diff --git a/tests/cli/test_cli_goal_interrupt.py b/tests/cli/test_cli_goal_interrupt.py index 851b87e856b..879e87c6a73 100644 --- a/tests/cli/test_cli_goal_interrupt.py +++ b/tests/cli/test_cli_goal_interrupt.py @@ -58,6 +58,11 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"): mgr = GoalManager(session_id=session_id, default_max_turns=5) mgr.set(goal_text) + # Skip Phase-A decompose so tests can patch judge_goal_freeform directly + # for legacy verdict assertions. + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) cli._goal_manager = mgr return cli, mgr @@ -81,7 +86,7 @@ class TestInterruptAutoPause: # Judge MUST NOT run on an interrupted turn. If it does, we've # regressed — fail loudly instead of silently querying a mock. - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an interrupted turn" ) @@ -106,7 +111,7 @@ class TestInterruptAutoPause: cli.conversation_history = [ {"role": "assistant", "content": "partial"}, ] - with patch("hermes_cli.goals.judge_goal"): + with patch("hermes_cli.goals.judge_goal_freeform"): cli._maybe_continue_goal_after_turn() assert mgr.state.status == "paused" @@ -125,7 +130,7 @@ class TestEmptyResponseSkip: {"role": "assistant", "content": " \n\n "}, ] - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called on an empty response" ) @@ -144,7 +149,7 @@ class TestEmptyResponseSkip: {"role": "user", "content": "go"}, ] - with patch("hermes_cli.goals.judge_goal") as judge_mock: + with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock: judge_mock.side_effect = AssertionError( "judge_goal called without an assistant response" ) @@ -169,7 +174,7 @@ class TestHealthyTurnStillRuns: # Force the judge to say "continue" without touching the network. with patch( - "hermes_cli.goals.judge_goal", + "hermes_cli.goals.judge_goal_freeform", return_value=("continue", "needs more steps", False), ): cli._maybe_continue_goal_after_turn() @@ -189,7 +194,7 @@ class TestHealthyTurnStillRuns: ] with patch( - "hermes_cli.goals.judge_goal", + "hermes_cli.goals.judge_goal_freeform", return_value=("done", "goal satisfied", False), ): cli._maybe_continue_goal_after_turn() diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py index 14f536aa4f8..ce9e8d6ad1e 100644 --- a/tests/gateway/test_goal_verdict_send.py +++ b/tests/gateway/test_goal_verdict_send.py @@ -106,8 +106,11 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("ship the feature") + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -135,8 +138,11 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home): mgr = GoalManager(session_entry.session_id) mgr.set("polish the docs") + mgr.state.decomposed = True + from hermes_cli.goals import save_goal as _sg + _sg(mgr.session_id, mgr.state) - with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -164,7 +170,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home): state.turns_used = 2 save_goal(session_entry.session_id, state) - with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)): await runner._post_turn_goal_continuation( session_entry=session_entry, source=src, @@ -211,7 +217,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home): runner.adapters[Platform.TELEGRAM] = _NoSendAdapter() - with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)): + with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)): # must not raise await runner._post_turn_goal_continuation( session_entry=session_entry, diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py index b5afd716c9e..47a043a32cf 100644 --- a/tests/hermes_cli/test_goals.py +++ b/tests/hermes_cli/test_goals.py @@ -253,14 +253,20 @@ class TestGoalManager: assert mgr2.is_active() def test_evaluate_after_turn_done(self, hermes_home): - """Judge says done → status=done, no continuation.""" + """Judge says done → status=done, no continuation. + + Skips Phase-A decompose by patching ``decompose_goal`` to return + an empty checklist so the manager falls through to the freeform + judge path (legacy behavior preserved when decompose is unavailable). + """ from hermes_cli import goals from hermes_cli.goals import GoalManager mgr = GoalManager(session_id="eval-sid-1") mgr.set("ship it") - with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)): decision = mgr.evaluate_after_turn("I shipped the feature.") assert decision["verdict"] == "done" @@ -276,7 +282,8 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5) mgr.set("a long goal") - with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)): decision = mgr.evaluate_after_turn("made some progress") assert decision["verdict"] == "continue" @@ -294,7 +301,8 @@ class TestGoalManager: mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2) mgr.set("hard goal") - with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.turns_used == 1 @@ -434,9 +442,11 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20) mgr.set("do a thing") - with patch.object( - goals, "judge_goal", return_value=("continue", "judge returned empty response", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "judge returned empty response", True), + ): d1 = mgr.evaluate_after_turn("step 1") assert d1["should_continue"] is True assert mgr.state.consecutive_parse_failures == 1 @@ -463,17 +473,21 @@ class TestJudgeParseFailureAutoPause: mgr.set("another goal") # Two parse failures… - with patch.object( - goals, "judge_goal", return_value=("continue", "not json", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "not json", True), + ): mgr.evaluate_after_turn("step 1") mgr.evaluate_after_turn("step 2") assert mgr.state.consecutive_parse_failures == 2 # …then one clean reply resets the counter. - with patch.object( - goals, "judge_goal", return_value=("continue", "making progress", False) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "making progress", False), + ): d = mgr.evaluate_after_turn("step 3") assert d["should_continue"] is True assert mgr.state.consecutive_parse_failures == 0 @@ -486,9 +500,11 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20) mgr.set("goal") - with patch.object( - goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "judge error: RuntimeError", False), + ): for _ in range(5): d = mgr.evaluate_after_turn("still going") assert d["should_continue"] is True @@ -505,12 +521,553 @@ class TestJudgeParseFailureAutoPause: mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20) mgr.set("persistent goal") - with patch.object( - goals, "judge_goal", return_value=("continue", "empty", True) - ): + with patch.object(goals, "decompose_goal", return_value=([], "stub")), \ + patch.object( + goals, "judge_goal_freeform", + return_value=("continue", "empty", True), + ): mgr.evaluate_after_turn("r") mgr.evaluate_after_turn("r") reloaded = load_goal("parse-fail-sid-4") assert reloaded is not None assert reloaded.consecutive_parse_failures == 2 + + +# ────────────────────────────────────────────────────────────────────── +# Checklist mode: GoalState backcompat + ChecklistItem +# ────────────────────────────────────────────────────────────────────── + + +class TestGoalStateBackcompat: + def test_old_state_meta_row_loads_without_checklist_fields(self): + """A goal serialized BEFORE the checklist fields existed must + round-trip through GoalState.from_json with empty defaults.""" + from hermes_cli.goals import GoalState + + legacy_json = json.dumps({ + "goal": "do the thing", + "status": "active", + "turns_used": 3, + "max_turns": 20, + "created_at": 1.0, + "last_turn_at": 2.0, + "last_verdict": "continue", + "last_reason": "still working", + "paused_reason": None, + "consecutive_parse_failures": 1, + }) + state = GoalState.from_json(legacy_json) + assert state.goal == "do the thing" + assert state.checklist == [] + assert state.decomposed is False + + def test_new_state_round_trip(self): + from hermes_cli.goals import ( + ChecklistItem, + GoalState, + ITEM_COMPLETED, + ITEM_PENDING, + ADDED_BY_JUDGE, + ADDED_BY_USER, + ) + + state = GoalState( + goal="g", + decomposed=True, + checklist=[ + ChecklistItem(text="a", status=ITEM_COMPLETED, + added_by=ADDED_BY_JUDGE, evidence="done"), + ChecklistItem(text="b", status=ITEM_PENDING, + added_by=ADDED_BY_USER), + ], + ) + round_tripped = GoalState.from_json(state.to_json()) + assert round_tripped.decomposed is True + assert len(round_tripped.checklist) == 2 + assert round_tripped.checklist[0].text == "a" + assert round_tripped.checklist[0].status == ITEM_COMPLETED + assert round_tripped.checklist[0].evidence == "done" + assert round_tripped.checklist[1].added_by == ADDED_BY_USER + + def test_checklist_counts_and_all_terminal(self): + from hermes_cli.goals import ( + ChecklistItem, GoalState, + ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING, + ) + + state = GoalState( + goal="g", + checklist=[ + ChecklistItem(text="a", status=ITEM_COMPLETED), + ChecklistItem(text="b", status=ITEM_IMPOSSIBLE), + ChecklistItem(text="c", status=ITEM_PENDING), + ], + ) + total, done, imp, pending = state.checklist_counts() + assert (total, done, imp, pending) == (3, 1, 1, 1) + assert state.all_terminal() is False + + state.checklist[2].status = ITEM_IMPOSSIBLE + assert state.all_terminal() is True + + def test_empty_checklist_is_not_all_terminal(self): + """Empty list must NOT be considered done.""" + from hermes_cli.goals import GoalState + + state = GoalState(goal="g") + assert state.all_terminal() is False + + +# ────────────────────────────────────────────────────────────────────── +# Phase A: decompose +# ────────────────────────────────────────────────────────────────────── + + +class TestPhaseADecompose: + def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home): + from hermes_cli import goals + from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE + + mgr = GoalManager(session_id="phase-a-sid-1") + mgr.set("build a website") + + items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}] + with patch.object(goals, "decompose_goal", return_value=(items, None)): + d = mgr.evaluate_after_turn("(initial response)") + + assert d["verdict"] == "decompose" + assert d["should_continue"] is True + # Phase A produces a continuation prompt that includes the checklist. + assert d["continuation_prompt"] is not None + assert "Checklist progress" in d["continuation_prompt"] + assert mgr.state.decomposed is True + assert len(mgr.state.checklist) == 2 + assert mgr.state.checklist[0].text == "homepage exists" + assert mgr.state.checklist[0].status == ITEM_PENDING + assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE + + def test_decompose_only_runs_once(self, hermes_home): + """Decomposed=True after first call. Subsequent calls go to Phase B.""" + from hermes_cli import goals + from hermes_cli.goals import GoalManager + + mgr = GoalManager(session_id="phase-a-sid-2") + mgr.set("g") + + with patch.object( + goals, "decompose_goal", return_value=([{"text": "x"}], None) + ) as decompose_mock, patch.object( + goals, "evaluate_checklist", + return_value=({"updates": [], "new_items": [], "reason": "..."}, False), + ) as eval_mock: + mgr.evaluate_after_turn("turn 1") + mgr.evaluate_after_turn("turn 2") + mgr.evaluate_after_turn("turn 3") + + assert decompose_mock.call_count == 1 + assert eval_mock.call_count == 2 + + def test_decompose_failure_falls_back_to_freeform(self, hermes_home): + """If decompose returns no items, manager falls through to freeform judge.""" + from hermes_cli import goals + from hermes_cli.goals import GoalManager + + mgr = GoalManager(session_id="phase-a-sid-3") + mgr.set("g") + + with patch.object(goals, "decompose_goal", return_value=([], "model error")), \ + patch.object(goals, "judge_goal_freeform", + return_value=("done", "shipped", False)): + d = mgr.evaluate_after_turn("done!") + + assert d["verdict"] == "done" + assert mgr.state.decomposed is True + assert mgr.state.checklist == [] + + +# ────────────────────────────────────────────────────────────────────── +# Phase B: evaluate (checklist mode) +# ────────────────────────────────────────────────────────────────────── + + +class TestPhaseBChecklist: + def _make_decomposed_mgr(self, sid: str, items): + """Helper: skip Phase A, install a decomposed checklist directly.""" + from hermes_cli.goals import ( + GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE, + ) + from hermes_cli import goals as _g + mgr = GoalManager(session_id=sid) + mgr.set("a goal") + mgr.state.decomposed = True + mgr.state.checklist = [ + ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE) + for t in items + ] + _g.save_goal(sid, mgr.state) + return mgr + + def test_judge_flips_pending_to_completed(self, hermes_home): + from hermes_cli import goals + from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING + + mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [ + {"index": 0, "status": "completed", "evidence": "done"}, + {"index": 1, "status": "completed", "evidence": "shipped"}, + ], + "new_items": [], + "reason": "made progress", + }, + False, + ), + ): + d = mgr.evaluate_after_turn("agent did stuff") + + assert d["verdict"] == "continue" + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "done" + assert mgr.state.checklist[1].status == ITEM_COMPLETED + assert mgr.state.checklist[2].status == ITEM_PENDING + + def test_goal_done_when_all_items_terminal(self, hermes_home): + from hermes_cli import goals + + mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [ + {"index": 0, "status": "completed", "evidence": "ok"}, + {"index": 1, "status": "impossible", "evidence": "blocked"}, + ], + "new_items": [], + "reason": "all done or blocked", + }, + False, + ), + ): + d = mgr.evaluate_after_turn("response") + + assert d["verdict"] == "done" + assert d["should_continue"] is False + assert mgr.state.status == "done" + + def test_stickiness_judge_cannot_regress_completed(self, hermes_home): + """Once an item is completed, judge updates trying to flip it back are ignored.""" + from hermes_cli import goals + from hermes_cli.goals import ITEM_COMPLETED + + mgr = self._make_decomposed_mgr("phase-b-stick", ["a"]) + # First turn completes item 0. + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [{"index": 0, "status": "completed", "evidence": "yes"}], + "new_items": [], + "reason": "done", + }, + False, + ), + ): + mgr.evaluate_after_turn("turn 1") + assert mgr.state.checklist[0].status == ITEM_COMPLETED + # Second turn: judge tries to send a non-terminal update. + # _parse_evaluate_response already filters non-terminal, but at the + # apply layer we also skip terminal items entirely. Smoke both. + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [{"index": 0, "status": "impossible", "evidence": "regress"}], + "new_items": [], + "reason": "trying to regress", + }, + False, + ), + ): + mgr.evaluate_after_turn("turn 2") + # Sticky: status stays completed, evidence unchanged. + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "yes" + + def test_judge_appends_new_items(self, hermes_home): + from hermes_cli import goals + + mgr = self._make_decomposed_mgr("phase-b-new", ["a"]) + with patch.object( + goals, "evaluate_checklist", + return_value=( + { + "updates": [], + "new_items": [{"text": "newly discovered"}, {"text": "also this"}], + "reason": "found more work", + }, + False, + ), + ): + mgr.evaluate_after_turn("response") + assert len(mgr.state.checklist) == 3 + assert mgr.state.checklist[1].text == "newly discovered" + assert mgr.state.checklist[1].added_by == "judge" + + +# ────────────────────────────────────────────────────────────────────── +# /subgoal user controls +# ────────────────────────────────────────────────────────────────────── + + +class TestSubgoalUserControls: + def test_add_subgoal_appends_user_item(self, hermes_home): + from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER + + mgr = GoalManager(session_id="user-sid-1") + mgr.set("g") + item = mgr.add_subgoal("user added") + assert item.text == "user added" + assert item.status == ITEM_PENDING + assert item.added_by == ADDED_BY_USER + assert len(mgr.state.checklist) == 1 + + def test_add_subgoal_requires_active_goal(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-2") + with pytest.raises(RuntimeError): + mgr.add_subgoal("x") + + def test_add_subgoal_rejects_empty_text(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-3") + mgr.set("g") + with pytest.raises(ValueError): + mgr.add_subgoal(" ") + + def test_mark_subgoal_uses_1_based_index(self, hermes_home): + from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE + mgr = GoalManager(session_id="user-sid-4") + mgr.set("g") + mgr.add_subgoal("a") + mgr.add_subgoal("b") + mgr.add_subgoal("c") + mgr.mark_subgoal(2, "completed") + mgr.mark_subgoal(3, "impossible") + assert mgr.state.checklist[0].status == "pending" + assert mgr.state.checklist[1].status == ITEM_COMPLETED + assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE + + def test_mark_subgoal_rejects_invalid_index(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-5") + mgr.set("g") + mgr.add_subgoal("a") + with pytest.raises(IndexError): + mgr.mark_subgoal(5, "completed") + with pytest.raises(IndexError): + mgr.mark_subgoal(0, "completed") + + def test_user_can_revert_terminal_item(self, hermes_home): + """User mark_subgoal bypasses stickiness — only path to revert.""" + from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING + mgr = GoalManager(session_id="user-sid-6") + mgr.set("g") + mgr.add_subgoal("a") + mgr.mark_subgoal(1, "completed") + assert mgr.state.checklist[0].status == ITEM_COMPLETED + mgr.mark_subgoal(1, "pending") + assert mgr.state.checklist[0].status == ITEM_PENDING + + def test_remove_subgoal(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-7") + mgr.set("g") + mgr.add_subgoal("a") + mgr.add_subgoal("b") + mgr.add_subgoal("c") + removed = mgr.remove_subgoal(2) + assert removed.text == "b" + assert [it.text for it in mgr.state.checklist] == ["a", "c"] + + def test_clear_checklist_resets_decomposed(self, hermes_home): + from hermes_cli.goals import GoalManager + mgr = GoalManager(session_id="user-sid-8") + mgr.set("g") + mgr.state.decomposed = True + mgr.add_subgoal("a") + mgr.clear_checklist() + assert mgr.state.checklist == [] + assert mgr.state.decomposed is False + + +# ────────────────────────────────────────────────────────────────────── +# Conversation dump +# ────────────────────────────────────────────────────────────────────── + + +class TestConversationDump: + def test_dump_writes_messages_to_goals_dir(self, hermes_home): + from hermes_cli.goals import dump_conversation, conversation_dump_path + + msgs = [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}, + ] + path = dump_conversation("dump-sid-1", msgs) + assert path is not None + assert path.exists() + # Path is under /goals/.json + assert path.parent.name == "goals" + assert path.name == "dump-sid-1.json" + + loaded = json.loads(path.read_text()) + assert loaded == msgs + + # conversation_dump_path returns the same path + assert conversation_dump_path("dump-sid-1") == path + + def test_dump_handles_unsafe_session_id(self, hermes_home): + from hermes_cli.goals import dump_conversation + + path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}]) + assert path is not None + # No traversal — slashes are normalized to underscores. (Periods are + # preserved because they're legitimate in filenames; the resulting + # name still cannot escape /goals/ since path + # separators are gone.) + assert "/" not in path.name + assert path.parent.name == "goals" + # Verify the resolved path stays under the goals dir. + from hermes_cli.goals import _goals_dump_dir + goals_dir = _goals_dump_dir().resolve() + assert str(path.resolve()).startswith(str(goals_dir)) + + def test_dump_skips_when_messages_empty(self, hermes_home): + from hermes_cli.goals import dump_conversation + assert dump_conversation("sid", []) is None + assert dump_conversation("", [{"role": "user", "content": "x"}]) is None + + +# ────────────────────────────────────────────────────────────────────── +# Judge read_file tool: path restriction +# ────────────────────────────────────────────────────────────────────── + + +class TestJudgeReadFile: + def test_restricted_to_allowed_path(self, hermes_home, tmp_path): + from hermes_cli.goals import _judge_read_file + + allowed = tmp_path / "allowed.json" + allowed.write_text("hello\nworld\n") + + ok = _judge_read_file(str(allowed), allowed_path=allowed) + loaded = json.loads(ok) + assert loaded["content"].startswith("hello") + + # Try to read a different file. + sneaky = tmp_path / "secret.txt" + sneaky.write_text("nope\n") + denied = _judge_read_file(str(sneaky), allowed_path=allowed) + loaded = json.loads(denied) + assert "error" in loaded + assert "restricted" in loaded["error"] + + def test_pagination(self, hermes_home, tmp_path): + from hermes_cli.goals import _judge_read_file + f = tmp_path / "big.json" + f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n") + + # offset=10, limit=5 should return lines 10..14. + result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f)) + assert result["returned"] == 5 + assert "line-9" in result["content"] # 1-based: line 10 == zero-indexed 9 + assert result["next_offset"] == 15 + + +# ────────────────────────────────────────────────────────────────────── +# Index conversion: judge emits 1-based, apply layer uses 0-based +# ────────────────────────────────────────────────────────────────────── + + +class TestJudgeIndexConversion: + def test_parse_evaluate_converts_1based_to_0based(self): + """The judge sees the checklist with 1-based indices (rendered as + '1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices. + ``_parse_evaluate_response`` must convert them to 0-based so the + apply layer can index ``state.checklist`` directly. + """ + from hermes_cli.goals import _parse_evaluate_response + + raw = ''' + {"updates": [ + {"index": 1, "status": "completed", "evidence": "first item"}, + {"index": 3, "status": "impossible", "evidence": "third item"} + ], + "new_items": [], + "reason": "evaluated"} + ''' + parsed, parse_failed = _parse_evaluate_response(raw) + assert parse_failed is False + # 1 → 0, 3 → 2 + assert [u["index"] for u in parsed["updates"]] == [0, 2] + assert parsed["updates"][0]["evidence"] == "first item" + assert parsed["updates"][1]["status"] == "impossible" + + def test_full_round_trip_judge_index_to_state(self, hermes_home): + """End-to-end: judge emits 1-based, parser converts, apply layer + flips the right items in state.checklist.""" + from hermes_cli import goals + from hermes_cli.goals import ( + GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED, + ADDED_BY_JUDGE, + ) + + mgr = GoalManager(session_id="idx-round-trip") + mgr.set("g") + mgr.state.decomposed = True + mgr.state.checklist = [ + ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE), + ] + goals.save_goal("idx-round-trip", mgr.state) + + # Simulate the judge returning a raw-JSON Phase-B reply via the + # auxiliary client: the parser handles the 1-based → 0-based + # conversion so the apply layer flips item 1 (text="first"). + class FakeMessage: + content = ''' + {"updates": [{"index": 1, "status": "completed", "evidence": "first done"}], + "new_items": [], + "reason": "..."} + ''' + tool_calls = None + + class FakeChoice: + message = FakeMessage() + + class FakeResponse: + choices = [FakeChoice()] + + class FakeClient: + class chat: + class completions: + @staticmethod + def create(**kwargs): + return FakeResponse() + + with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")): + mgr.evaluate_after_turn("ran the script and item 1 is done") + + # Item 1 (text="first") should now be completed. + assert mgr.state.checklist[0].text == "first" + assert mgr.state.checklist[0].status == ITEM_COMPLETED + assert mgr.state.checklist[0].evidence == "first done" + # Other items still pending. + assert mgr.state.checklist[1].status == ITEM_PENDING + assert mgr.state.checklist[2].status == ITEM_PENDING diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 73b2fc1a811..07febffaf98 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -3251,6 +3251,7 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None: decision = goal_mgr.evaluate_after_turn( raw, user_initiated=True, + messages=list(session.get("history") or []), ) verdict_msg = decision.get("message") or "" if verdict_msg: