diff --git a/cli.py b/cli.py
index b3b6a73ffe2..e54acc1c76c 100644
--- a/cli.py
+++ b/cli.py
@@ -7244,6 +7244,8 @@ class HermesCLI:
                 _cprint(f"  No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}")
         elif canonical == "goal":
             self._handle_goal_command(cmd_original)
+        elif canonical == "subgoal":
+            self._handle_subgoal_command(cmd_original)
         elif canonical == "skin":
             self._handle_skin_command(cmd_original)
         elif canonical == "voice":
@@ -7840,6 +7842,103 @@ class HermesCLI:
         except Exception:
             pass
 
+    def _handle_subgoal_command(self, cmd: str) -> None:
+        """Dispatch /subgoal subcommands.
+
+        Forms:
+          /subgoal                              show the checklist
+          /subgoal <text>                       append a user item
+          /subgoal complete <n>                 mark item n completed
+          /subgoal impossible <n>               mark item n impossible
+          /subgoal undo <n>                     revert item n to pending
+          /subgoal remove <n>                   delete item n
+          /subgoal clear                        wipe the checklist (judge re-decomposes)
+        """
+        parts = (cmd or "").strip().split(None, 2)
+        # parts[0] == "/subgoal"; remainder is what the user typed
+        arg = " ".join(parts[1:]).strip() if len(parts) > 1 else ""
+
+        mgr = self._get_goal_manager()
+        if mgr is None:
+            _cprint(f"  {_DIM}Goals unavailable (no active session).{_RST}")
+            return
+
+        if not mgr.has_goal():
+            _cprint(f"  {_DIM}No active goal. Set one with /goal <text>.{_RST}")
+            return
+
+        # No args → show the checklist.
+        if not arg:
+            _cprint(f"  {mgr.status_line()}")
+            _cprint(f"  {mgr.render_checklist()}")
+            return
+
+        tokens = arg.split(None, 1)
+        verb = tokens[0].lower()
+        rest = tokens[1].strip() if len(tokens) > 1 else ""
+
+        # Action verbs operate on indices.
+        action_status_map = {
+            "complete": "completed",
+            "completed": "completed",
+            "done": "completed",
+            "impossible": "impossible",
+            "imp": "impossible",
+            "skip": "impossible",
+            "undo": "pending",
+            "pending": "pending",
+            "reset": "pending",
+        }
+        if verb in action_status_map:
+            if not rest:
+                _cprint(f"  Usage: /subgoal {verb} <n>")
+                return
+            try:
+                idx = int(rest.split()[0])
+            except ValueError:
+                _cprint(f"  /subgoal {verb}: <n> must be an integer (1-based index).")
+                return
+            try:
+                item = mgr.mark_subgoal(idx, action_status_map[verb])
+            except (IndexError, ValueError, RuntimeError) as exc:
+                _cprint(f"  /subgoal {verb}: {exc}")
+                return
+            _cprint(f"  ✓ Item {idx} → {item.status}: {item.text}")
+            return
+
+        if verb == "remove":
+            if not rest:
+                _cprint("  Usage: /subgoal remove <n>")
+                return
+            try:
+                idx = int(rest.split()[0])
+            except ValueError:
+                _cprint("  /subgoal remove: <n> must be an integer (1-based index).")
+                return
+            try:
+                removed = mgr.remove_subgoal(idx)
+            except (IndexError, RuntimeError) as exc:
+                _cprint(f"  /subgoal remove: {exc}")
+                return
+            _cprint(f"  ✓ Removed item {idx}: {removed.text}")
+            return
+
+        if verb == "clear":
+            mgr.clear_checklist()
+            _cprint(
+                "  ✓ Checklist cleared. The judge will re-decompose on the next turn."
+            )
+            return
+
+        # Otherwise: append `arg` as a user-authored checklist item.
+        try:
+            item = mgr.add_subgoal(arg)
+        except (ValueError, RuntimeError) as exc:
+            _cprint(f"  /subgoal: {exc}")
+            return
+        idx = len(mgr.state.checklist) if mgr.state else 0
+        _cprint(f"  ✓ Added subgoal {idx}: {item.text}")
+
     def _maybe_continue_goal_after_turn(self) -> None:
         """Hook run after every CLI turn. Judges + maybe re-queues.
 
@@ -7917,7 +8016,11 @@ class HermesCLI:
         if not last_response.strip():
             return
 
-        decision = mgr.evaluate_after_turn(last_response, user_initiated=True)
+        decision = mgr.evaluate_after_turn(
+            last_response,
+            user_initiated=True,
+            messages=getattr(self, "conversation_history", None) or [],
+        )
         msg = decision.get("message") or ""
         if msg:
             _cprint(f"  {msg}")
diff --git a/gateway/run.py b/gateway/run.py
index 129827cbfb6..358e3702489 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -6063,6 +6063,12 @@ class GatewayRunner:
                     return await self._handle_goal_command(event)
                 return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal."
 
+            # /subgoal is safe mid-run — it only modifies the active goal's
+            # checklist, which the judge consults at turn boundaries. There
+            # is no race with the running turn.
+            if _cmd_def_inner and _cmd_def_inner.name == "subgoal":
+                return await self._handle_subgoal_command(event)
+
             # Session-level toggles that are safe to run mid-agent —
             # /yolo can unblock a pending approval prompt, /verbose cycles
             # the tool-progress display mode for the ongoing stream.
@@ -6442,6 +6448,9 @@ class GatewayRunner:
         if canonical == "goal":
             return await self._handle_goal_command(event)
 
+        if canonical == "subgoal":
+            return await self._handle_subgoal_command(event)
+
         if canonical == "voice":
             return await self._handle_voice_command(event)
 
@@ -6616,10 +6625,18 @@ class GatewayRunner:
                     except Exception:
                         session_entry = None
                     if session_entry is not None:
+                        # Pull the agent's full messages list from the result
+                        # so the judge can dump it for its read_file tool.
+                        _agent_messages: list = []
+                        if isinstance(_agent_result, dict):
+                            _msgs = _agent_result.get("messages")
+                            if isinstance(_msgs, list):
+                                _agent_messages = _msgs
                         await self._post_turn_goal_continuation(
                             session_entry=session_entry,
                             source=source,
                             final_response=_final_text,
+                            agent_messages=_agent_messages,
                         )
             except Exception as _goal_exc:
                 logger.debug("goal continuation hook failed: %s", _goal_exc)
@@ -9335,6 +9352,83 @@ class GatewayRunner:
 
         return t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
 
+    async def _handle_subgoal_command(self, event: "MessageEvent") -> str:
+        """Handle /subgoal for gateway platforms.
+
+        Forms (mirror of CLI):
+          /subgoal                              show the checklist
+          /subgoal <text>                       append a user item
+          /subgoal complete <n> | done <n>      mark item n completed
+          /subgoal impossible <n>               mark item n impossible
+          /subgoal undo <n>                     revert item n to pending
+          /subgoal remove <n>                   delete item n
+          /subgoal clear                        wipe the checklist
+        """
+        args = (event.get_command_args() or "").strip()
+
+        mgr, _session_entry = self._get_goal_manager_for_event(event)
+        if mgr is None:
+            return t("gateway.goal.unavailable")
+
+        if not mgr.has_goal():
+            return "No active goal. Set one with /goal <text>."
+
+        if not args:
+            return f"{mgr.status_line()}\n{mgr.render_checklist()}"
+
+        tokens = args.split(None, 1)
+        verb = tokens[0].lower()
+        rest = tokens[1].strip() if len(tokens) > 1 else ""
+
+        action_status_map = {
+            "complete": "completed",
+            "completed": "completed",
+            "done": "completed",
+            "impossible": "impossible",
+            "imp": "impossible",
+            "skip": "impossible",
+            "undo": "pending",
+            "pending": "pending",
+            "reset": "pending",
+        }
+        if verb in action_status_map:
+            if not rest:
+                return f"Usage: /subgoal {verb} <n>"
+            try:
+                idx = int(rest.split()[0])
+            except ValueError:
+                return f"/subgoal {verb}: <n> must be an integer (1-based index)."
+            try:
+                item = mgr.mark_subgoal(idx, action_status_map[verb])
+            except (IndexError, ValueError, RuntimeError) as exc:
+                return f"/subgoal {verb}: {exc}"
+            return f"✓ Item {idx} → {item.status}: {item.text}"
+
+        if verb == "remove":
+            if not rest:
+                return "Usage: /subgoal remove <n>"
+            try:
+                idx = int(rest.split()[0])
+            except ValueError:
+                return "/subgoal remove: <n> must be an integer (1-based index)."
+            try:
+                removed = mgr.remove_subgoal(idx)
+            except (IndexError, RuntimeError) as exc:
+                return f"/subgoal remove: {exc}"
+            return f"✓ Removed item {idx}: {removed.text}"
+
+        if verb == "clear":
+            mgr.clear_checklist()
+            return "✓ Checklist cleared. The judge will re-decompose on the next turn."
+
+        # Otherwise — append `args` as a new user-authored checklist item.
+        try:
+            item = mgr.add_subgoal(args)
+        except (ValueError, RuntimeError) as exc:
+            return f"/subgoal: {exc}"
+        idx = len(mgr.state.checklist) if mgr.state else 0
+        return f"✓ Added subgoal {idx}: {item.text}"
+
     async def _send_goal_status_notice(self, source: Any, message: str) -> None:
         """Send a /goal judge status line back to the originating chat/thread."""
         adapter = self.adapters.get(source.platform)
@@ -9403,6 +9497,7 @@ class GatewayRunner:
         session_entry: Any,
         source: Any,
         final_response: str,
+        agent_messages: Optional[list] = None,
     ) -> None:
         """Run the goal judge after a gateway turn and, if still active,
         enqueue a continuation prompt for the same session.
@@ -9430,7 +9525,11 @@ class GatewayRunner:
         if not mgr.is_active():
             return
 
-        decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True)
+        decision = mgr.evaluate_after_turn(
+            final_response or "",
+            user_initiated=True,
+            messages=agent_messages or [],
+        )
         msg = decision.get("message") or ""
 
         # Defer the status line until after the adapter has delivered the
diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py
index 3f543bba647..e2727c25bab 100644
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@@ -104,6 +104,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
                args_hint="<prompt>"),
     CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session",
                args_hint="[text | pause | resume | clear | status]"),
+    CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session",
+               args_hint="[text | complete N | impossible N | undo N | remove N | clear]"),
     CommandDef("status", "Show session info", "Session"),
     CommandDef("whoami", "Show your slash command access (admin / user)", "Info"),
     CommandDef("profile", "Show active profile name and home directory", "Info"),
diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py
index 894cdddb01b..4f54e74f66f 100644
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@@ -7,6 +7,18 @@ continuation prompt back into the same session and keeps working until the
 goal is done, turn budget is exhausted, the user pauses/clears it, or the
 user sends a new message (which takes priority and pauses the goal loop).
 
+Checklist mode (added 2026-05): when a goal is set, a Phase-A "decompose"
+call asks the judge to write an extremely detailed checklist of concrete
+completion criteria for that goal. On every subsequent turn (Phase B) the
+judge evaluates the agent's most recent output against EACH pending item
+and may flip pending → completed | impossible, or append new items it
+discovers along the way. The goal is done only when every checklist item
+is in a terminal status. This is much harsher than the freeform
+"is the goal done?" prompt and gives users a visible, verifiable progress
+surface via /subgoal. A bounded read_file tool loop lets the judge inspect
+the dumped conversation history when the snippet alone isn't enough to
+rule.
+
 State is persisted in SessionDB's ``state_meta`` table keyed by
 ``goal:<session_id>`` so ``/resume`` picks it up.
 
@@ -21,6 +33,9 @@ Design notes / invariants:
   prompt and also pauses the goal loop for that turn (we still re-judge
   after, so if the user's message happens to complete the goal the judge
   will say ``done``).
+- Stickiness: once an item is marked completed or impossible, only the user
+  (via /subgoal undo) can flip it back. Judge updates that try to regress
+  terminal items are silently ignored.
 - This module has zero hard dependency on ``cli.HermesCLI`` or the gateway
   runner — both wire the same ``GoalManager`` in.
 
@@ -31,10 +46,12 @@ from __future__ import annotations
 
 import json
 import logging
+import os
 import re
 import time
-from dataclasses import dataclass, asdict
-from typing import Any, Dict, Optional, Tuple
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
 
 logger = logging.getLogger(__name__)
 
@@ -44,8 +61,9 @@ logger = logging.getLogger(__name__)
 # ──────────────────────────────────────────────────────────────────────
 
 DEFAULT_MAX_TURNS = 20
-DEFAULT_JUDGE_TIMEOUT = 30.0
-# Cap how much of the last response + recent messages we send to the judge.
+DEFAULT_JUDGE_TIMEOUT = 60.0
+# Cap how much of the last response we send to the judge inline. The judge
+# can read the dumped conversation file via read_file if it needs more.
 _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # After this many consecutive judge *parse* failures (empty output / non-JSON),
 # the loop auto-pauses and points the user at the goal_judge config. API /
@@ -55,8 +73,36 @@ _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # exhausted with every reply shaped like `judge returned empty response` or
 # `judge reply was not JSON`.
 DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
+# Bound the Phase-B judge tool loop: if the judge keeps calling read_file
+# without ever emitting a verdict, cap it so we don't burn the model's budget.
+DEFAULT_MAX_JUDGE_TOOL_CALLS = 5
+# Cap a single read_file response so a judge that tries to read 100k lines
+# doesn't blow up its own context. Judge can paginate if needed.
+_JUDGE_READ_FILE_MAX_LINES = 400
+_JUDGE_READ_FILE_MAX_CHARS = 32_000
 
 
+# Status constants ────────────────────────────────────────────────────
+ITEM_PENDING = "pending"
+ITEM_COMPLETED = "completed"
+ITEM_IMPOSSIBLE = "impossible"
+TERMINAL_ITEM_STATUSES = frozenset({ITEM_COMPLETED, ITEM_IMPOSSIBLE})
+VALID_ITEM_STATUSES = frozenset({ITEM_PENDING, ITEM_COMPLETED, ITEM_IMPOSSIBLE})
+
+ITEM_MARKERS = {
+    ITEM_COMPLETED: "[x]",
+    ITEM_IMPOSSIBLE: "[!]",
+    ITEM_PENDING: "[ ]",
+}
+
+ADDED_BY_JUDGE = "judge"
+ADDED_BY_USER = "user"
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Continuation prompt
+# ──────────────────────────────────────────────────────────────────────
+
 CONTINUATION_PROMPT_TEMPLATE = (
     "[Continuing toward your standing goal]\n"
     "Goal: {goal}\n\n"
@@ -65,8 +111,56 @@ CONTINUATION_PROMPT_TEMPLATE = (
     "If you are blocked and need input from the user, say so clearly and stop."
 )
 
+CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE = (
+    "[Continuing toward your standing goal]\n"
+    "Goal: {goal}\n\n"
+    "Checklist progress ({done}/{total} done):\n"
+    "{checklist}\n\n"
+    "Work on the unchecked items above. Do not declare items done yourself "
+    "— a judge marks them based on evidence in your output. If an item is "
+    "genuinely impossible in this environment, explain why so the judge can "
+    "mark it impossible. If you are blocked on a remaining item and need "
+    "user input, say so clearly and stop."
+)
 
-JUDGE_SYSTEM_PROMPT = (
+
+# ──────────────────────────────────────────────────────────────────────
+# Phase-A: decompose prompts
+# ──────────────────────────────────────────────────────────────────────
+
+DECOMPOSE_SYSTEM_PROMPT = (
+    "You are a strict judge for an autonomous agent. Your first job, before "
+    "judging anything, is to break the user's stated goal into an EXTREMELY "
+    "DETAILED checklist of concrete, verifiable completion criteria. Each "
+    "item must be specific enough that a third party reading the agent's "
+    "output could decide unambiguously whether that item was achieved.\n\n"
+    "Be exhaustive. Bias toward MORE items, not fewer. Include sub-items, "
+    "edge cases, quality bars, deployment steps, verification checks, and "
+    "anything the user would reasonably expect from a goal of this type. "
+    "If the user said 'build me a website' you should be enumerating "
+    "homepage exists, navigation links work, content is non-placeholder, "
+    "mobile responsive, accessibility tags present, deployed somewhere "
+    "publicly accessible, domain/URL is functional, etc. Better to "
+    "over-specify and let a few items get marked impossible than to "
+    "under-specify and let the agent declare victory early.\n\n"
+    "Reply ONLY with a single JSON object on one line:\n"
+    '{"checklist": [{"text": "<item>"}, {"text": "<item>"}, ...]}'
+)
+
+DECOMPOSE_USER_PROMPT_TEMPLATE = (
+    "Goal:\n{goal}\n\n"
+    "Produce the harshest, most detailed checklist of completion criteria "
+    "you can. Aim for at least 5 items; more is better when warranted. "
+    "Each item should be a single verifiable statement of fact about the "
+    "finished work."
+)
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Phase-B: evaluate prompts
+# ──────────────────────────────────────────────────────────────────────
+
+EVALUATE_SYSTEM_PROMPT_FREEFORM = (
     "You are a strict judge evaluating whether an autonomous agent has "
     "achieved a user's stated goal. You receive the goal text and the "
     "agent's most recent response. Your only job is to decide whether "
@@ -78,11 +172,60 @@ JUDGE_SYSTEM_PROMPT = (
     "user input (treat this as DONE with reason describing the block).\n\n"
     "Otherwise the goal is NOT done — CONTINUE.\n\n"
     "Reply ONLY with a single JSON object on one line:\n"
-    '{\"done\": <true|false>, \"reason\": \"<one-sentence rationale>\"}'
+    '{"done": <true|false>, "reason": "<one-sentence rationale>"}'
 )
 
+EVALUATE_SYSTEM_PROMPT_CHECKLIST = (
+    "You are a strict judge evaluating an autonomous agent's progress on "
+    "a user's goal that has a detailed checklist of completion criteria. "
+    "For EACH currently-pending checklist item, decide whether the "
+    "available evidence shows the item is satisfied.\n\n"
+    "Be strict but not absurd. Default to leaving items pending UNLESS "
+    "evidence is reasonably clear. Reasonable evidence includes:\n"
+    "- The agent's most recent response describing or showing the work\n"
+    "- Tool call results visible in the conversation history (file writes, "
+    "command output, web requests, etc.)\n"
+    "- A clear statement by the agent that the work was done, when "
+    "supported by tool output earlier in the conversation\n\n"
+    "Do NOT require the agent to re-prove items it has already established "
+    "in earlier turns. If a tool call earlier in the conversation already "
+    "wrote a file, you do not need fresh `ls` output every turn — once "
+    "established, it's done.\n\n"
+    "Flip pending → completed when the response or recent tool calls show "
+    "the item is satisfied. Flip pending → impossible only when the work "
+    "demonstrates the item cannot be achieved in this environment (NOT "
+    "merely that the agent didn't try). Vague intentions ('I will do X "
+    "next') do NOT count as completion.\n\n"
+    "You may APPEND new checklist items if the agent's work reveals "
+    "criteria the original decomposition missed. Stay strict — only add "
+    "items that genuinely belong as completion criteria.\n\n"
+    "STICKINESS: items already marked completed or impossible are frozen. "
+    "Do not include them in your updates. Only the user can revert them.\n\n"
+    "TOOLS: you have read_file(path, offset, limit) available. The full "
+    "conversation history for this session is dumped to a JSON file whose "
+    "path is given in the user message. Call read_file on it when the "
+    "snippet is ambiguous, when you need to verify a tool call actually "
+    "happened, or when you want to see what skills the agent loaded. "
+    "Otherwise, judge from the snippet directly — extra reads cost tokens.\n\n"
+    "When you are ready to rule, reply ONLY with a single JSON object:\n"
+    '{"updates": [{"index": <i>, "status": "completed|impossible", "evidence": "<why>"}, ...], '
+    '"new_items": [{"text": "<new item>"}, ...], '
+    '"reason": "<one-sentence overall rationale>"}\n'
+    "When citing evidence, reference the agent's actual output specifically. "
+    "Empty updates is fine. Empty new_items is fine. The reason field is required."
+)
 
-JUDGE_USER_PROMPT_TEMPLATE = (
+EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = (
+    "Goal:\n{goal}\n\n"
+    "Current checklist (each item is numbered, 1-based — use these "
+    "exact 1-based numbers as the ``index`` field in your updates):\n{checklist_block}\n\n"
+    "Agent's most recent response (snippet):\n{response}\n\n"
+    "Conversation history file (call read_file on this path if you need "
+    "more context — pagination supported via offset/limit):\n{history_path}\n\n"
+    "Evaluate each pending item. Cite specific evidence."
+)
+
+EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = (
     "Goal:\n{goal}\n\n"
     "Agent's most recent response:\n{response}\n\n"
     "Is the goal satisfied?"
@@ -90,16 +233,55 @@ JUDGE_USER_PROMPT_TEMPLATE = (
 
 
 # ──────────────────────────────────────────────────────────────────────
-# Dataclass
+# Dataclasses
 # ──────────────────────────────────────────────────────────────────────
 
 
+@dataclass
+class ChecklistItem:
+    """One concrete completion criterion attached to a goal."""
+
+    text: str
+    status: str = ITEM_PENDING            # pending | completed | impossible
+    added_by: str = ADDED_BY_JUDGE        # judge | user
+    added_at: float = 0.0
+    completed_at: Optional[float] = None
+    evidence: Optional[str] = None        # judge's rationale on flip
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ChecklistItem":
+        text = str(data.get("text", "")).strip()
+        if not text:
+            text = "(empty item)"
+        status = str(data.get("status", ITEM_PENDING)).strip().lower()
+        if status not in VALID_ITEM_STATUSES:
+            status = ITEM_PENDING
+        added_by = str(data.get("added_by", ADDED_BY_JUDGE)).strip().lower()
+        if added_by not in (ADDED_BY_JUDGE, ADDED_BY_USER):
+            added_by = ADDED_BY_JUDGE
+        return cls(
+            text=text,
+            status=status,
+            added_by=added_by,
+            added_at=float(data.get("added_at", 0.0) or 0.0),
+            completed_at=(
+                float(data["completed_at"])
+                if data.get("completed_at") is not None
+                else None
+            ),
+            evidence=data.get("evidence"),
+        )
+
+
 @dataclass
 class GoalState:
     """Serializable goal state stored per session."""
 
     goal: str
-    status: str = "active"          # active | paused | done | cleared
+    status: str = "active"                    # active | paused | done | cleared
     turns_used: int = 0
     max_turns: int = DEFAULT_MAX_TURNS
     created_at: float = 0.0
@@ -108,13 +290,28 @@ class GoalState:
     last_reason: Optional[str] = None
     paused_reason: Optional[str] = None       # why we auto-paused (budget, etc.)
     consecutive_parse_failures: int = 0       # judge-output parse failures in a row
+    # Checklist mode (added 2026-05). Both fields default safely so old
+    # state_meta rows load unchanged.
+    checklist: List[ChecklistItem] = field(default_factory=list)
+    decomposed: bool = False                  # has Phase-A run for this goal?
 
     def to_json(self) -> str:
-        return json.dumps(asdict(self), ensure_ascii=False)
+        data = asdict(self)
+        # asdict already serializes ChecklistItem via dataclass recursion.
+        return json.dumps(data, ensure_ascii=False)
 
     @classmethod
     def from_json(cls, raw: str) -> "GoalState":
         data = json.loads(raw)
+        raw_checklist = data.get("checklist") or []
+        checklist: List[ChecklistItem] = []
+        if isinstance(raw_checklist, list):
+            for item in raw_checklist:
+                if isinstance(item, dict):
+                    try:
+                        checklist.append(ChecklistItem.from_dict(item))
+                    except Exception:
+                        continue
         return cls(
             goal=data.get("goal", ""),
             status=data.get("status", "active"),
@@ -126,8 +323,39 @@ class GoalState:
             last_reason=data.get("last_reason"),
             paused_reason=data.get("paused_reason"),
             consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
+            checklist=checklist,
+            decomposed=bool(data.get("decomposed", False)),
         )
 
+    # --- checklist helpers ------------------------------------------------
+
+    def checklist_counts(self) -> Tuple[int, int, int, int]:
+        """Return (total, completed, impossible, pending)."""
+        total = len(self.checklist)
+        completed = sum(1 for it in self.checklist if it.status == ITEM_COMPLETED)
+        impossible = sum(1 for it in self.checklist if it.status == ITEM_IMPOSSIBLE)
+        pending = total - completed - impossible
+        return total, completed, impossible, pending
+
+    def all_terminal(self) -> bool:
+        """True iff at least one item exists and every item is in a terminal status."""
+        if not self.checklist:
+            return False
+        return all(it.status in TERMINAL_ITEM_STATUSES for it in self.checklist)
+
+    def render_checklist(self, *, numbered: bool = False) -> str:
+        if not self.checklist:
+            return "(empty)"
+        lines = []
+        for i, item in enumerate(self.checklist, start=1):
+            marker = ITEM_MARKERS.get(item.status, "[?]")
+            prefix = f"{i}. {marker}" if numbered else f"  {marker}"
+            line = f"{prefix} {item.text}"
+            if item.status == ITEM_IMPOSSIBLE and item.evidence:
+                line += f" (impossible: {item.evidence})"
+            lines.append(line)
+        return "\n".join(lines)
+
 
 # ──────────────────────────────────────────────────────────────────────
 # Persistence (SessionDB state_meta)
@@ -215,7 +443,63 @@ def clear_goal(session_id: str) -> None:
 
 
 # ──────────────────────────────────────────────────────────────────────
-# Judge
+# Conversation-history dump (read by the judge tool loop)
+# ──────────────────────────────────────────────────────────────────────
+
+
+def _goals_dump_dir() -> Optional[Path]:
+    """Return ``<HERMES_HOME>/goals`` (created on first use), or None on error."""
+    try:
+        from hermes_constants import get_hermes_home
+
+        home = Path(get_hermes_home())
+    except Exception as exc:
+        logger.debug("goals dump dir: get_hermes_home failed: %s", exc)
+        return None
+    try:
+        path = home / "goals"
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+    except Exception as exc:
+        logger.debug("goals dump dir: mkdir failed: %s", exc)
+        return None
+
+
+def _safe_session_filename(session_id: str) -> str:
+    """Make a session_id safe for use as a filename component."""
+    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", session_id or "unknown")
+    # Bound length to keep filesystem happy.
+    return cleaned[:128] or "unknown"
+
+
+def conversation_dump_path(session_id: str) -> Optional[Path]:
+    """Where the dumped messages JSON for ``session_id`` lives."""
+    base = _goals_dump_dir()
+    if base is None:
+        return None
+    return base / f"{_safe_session_filename(session_id)}.json"
+
+
+def dump_conversation(session_id: str, messages: List[Dict[str, Any]]) -> Optional[Path]:
+    """Write ``messages`` to the goals/ dump file. Returns the path on success."""
+    if not session_id or not messages:
+        return None
+    path = conversation_dump_path(session_id)
+    if path is None:
+        return None
+    try:
+        # Best-effort: messages may contain non-JSON-serializable objects from
+        # provider-specific adapter shims. Fall through with default=str.
+        with open(path, "w", encoding="utf-8") as fh:
+            json.dump(messages, fh, ensure_ascii=False, indent=2, default=str)
+        return path
+    except Exception as exc:
+        logger.debug("dump_conversation: write failed: %s", exc)
+        return None
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Judge: parsing helpers
 # ──────────────────────────────────────────────────────────────────────
 
 
@@ -227,11 +511,34 @@ def _truncate(text: str, limit: int) -> str:
     return text[:limit] + "… [truncated]"
 
 
-_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
+_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)
+
+
+def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]:
+    """Best-effort extraction of a single JSON object from a possibly-prosey reply."""
+    if not raw:
+        return None
+    text = raw.strip()
+    if text.startswith("```"):
+        text = text.strip("`")
+        nl = text.find("\n")
+        if nl != -1:
+            text = text[nl + 1:]
+    try:
+        data = json.loads(text)
+    except Exception:
+        match = _JSON_OBJECT_RE.search(text)
+        if not match:
+            return None
+        try:
+            data = json.loads(match.group(0))
+        except Exception:
+            return None
+    return data if isinstance(data, dict) else None
 
 
 def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
-    """Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
+    """Parse the freeform judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
 
     Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
     judge returned output that couldn't be interpreted as the expected JSON
@@ -242,30 +549,8 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
     if not raw:
         return False, "judge returned empty response", True
 
-    text = raw.strip()
-
-    # Strip markdown code fences the model may wrap JSON in.
-    if text.startswith("```"):
-        text = text.strip("`")
-        # Peel off leading json/JSON/etc tag
-        nl = text.find("\n")
-        if nl != -1:
-            text = text[nl + 1:]
-
-    # First try: parse the whole blob.
-    data: Optional[Dict[str, Any]] = None
-    try:
-        data = json.loads(text)
-    except Exception:
-        # Second try: pull the first JSON object out.
-        match = _JSON_OBJECT_RE.search(text)
-        if match:
-            try:
-                data = json.loads(match.group(0))
-            except Exception:
-                data = None
-
-    if not isinstance(data, dict):
+    data = _extract_json_object(raw)
+    if data is None:
         return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
 
     done_val = data.get("done")
@@ -279,49 +564,286 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
     return done, reason, False
 
 
-def judge_goal(
+def _parse_decompose_response(raw: str) -> Tuple[List[Dict[str, Any]], bool]:
+    """Parse a Phase-A decompose reply. Returns (items, parse_failed)."""
+    if not raw:
+        return [], True
+    data = _extract_json_object(raw)
+    if data is None:
+        return [], True
+    raw_items = data.get("checklist")
+    if not isinstance(raw_items, list):
+        return [], True
+    out: List[Dict[str, Any]] = []
+    for item in raw_items:
+        if isinstance(item, dict):
+            text = str(item.get("text", "")).strip()
+            if text:
+                out.append({"text": text})
+        elif isinstance(item, str):
+            text = item.strip()
+            if text:
+                out.append({"text": text})
+    return out, False
+
+
+def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]:
+    """Parse a Phase-B checklist eval reply. Returns (parsed, parse_failed).
+
+    parsed = {"updates": [...], "new_items": [...], "reason": str}
+    """
+    if not raw:
+        return {"updates": [], "new_items": [], "reason": "judge returned empty response"}, True
+    data = _extract_json_object(raw)
+    if data is None:
+        return (
+            {
+                "updates": [],
+                "new_items": [],
+                "reason": f"judge reply was not JSON: {_truncate(raw, 200)!r}",
+            },
+            True,
+        )
+    updates = data.get("updates") or []
+    new_items = data.get("new_items") or []
+    reason = str(data.get("reason") or "").strip() or "no reason provided"
+    norm_updates = []
+    if isinstance(updates, list):
+        for upd in updates:
+            if not isinstance(upd, dict):
+                continue
+            try:
+                # Judge sees the checklist rendered with 1-based indices
+                # (matches the /subgoal CLI). Convert to 0-based here so the
+                # apply layer can index ``state.checklist`` directly.
+                idx_1based = int(upd.get("index"))
+            except (TypeError, ValueError):
+                continue
+            idx = idx_1based - 1
+            status = str(upd.get("status", "")).strip().lower()
+            if status not in TERMINAL_ITEM_STATUSES:
+                # Phase-B only accepts terminal flips. Pending → pending is a no-op.
+                continue
+            evidence = str(upd.get("evidence") or "").strip() or None
+            norm_updates.append({"index": idx, "status": status, "evidence": evidence})
+    norm_new = []
+    if isinstance(new_items, list):
+        for it in new_items:
+            if isinstance(it, dict):
+                text = str(it.get("text", "")).strip()
+                if text:
+                    norm_new.append({"text": text})
+            elif isinstance(it, str):
+                text = it.strip()
+                if text:
+                    norm_new.append({"text": text})
+    return {"updates": norm_updates, "new_items": norm_new, "reason": reason}, False
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Judge: read_file tool for the judge's bounded tool loop
+# ──────────────────────────────────────────────────────────────────────
+
+
+_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
+    "type": "function",
+    "function": {
+        "name": "read_file",
+        "description": (
+            "Read a portion of the dumped conversation history JSON file. "
+            "Use this when the snippet alone isn't enough to rule. Returns "
+            "lines from the file with 1-based line numbers. Pagination "
+            "supported via offset and limit. Reads beyond a built-in cap "
+            "are truncated."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "path": {
+                    "type": "string",
+                    "description": (
+                        "Absolute path to the conversation history file. "
+                        "You were given this in the user message."
+                    ),
+                },
+                "offset": {
+                    "type": "integer",
+                    "description": "1-indexed starting line number (default 1).",
+                    "default": 1,
+                },
+                "limit": {
+                    "type": "integer",
+                    "description": (
+                        f"Max lines to return (default {_JUDGE_READ_FILE_MAX_LINES})."
+                    ),
+                    "default": _JUDGE_READ_FILE_MAX_LINES,
+                },
+            },
+            "required": ["path"],
+        },
+    },
+}
+
+
+def _judge_read_file(
+    path: str,
+    *,
+    offset: int = 1,
+    limit: int = _JUDGE_READ_FILE_MAX_LINES,
+    allowed_path: Optional[Path] = None,
+) -> str:
+    """Bounded read of the dumped conversation file. Returns JSON-serializable text.
+
+    Restricted to ``allowed_path`` when provided — the judge cannot use this
+    tool to read arbitrary files.
+    """
+    if not path:
+        return json.dumps({"error": "path is required"})
+    try:
+        target = Path(path).resolve()
+    except Exception as exc:
+        return json.dumps({"error": f"path resolve failed: {exc}"})
+
+    if allowed_path is not None:
+        try:
+            allowed = allowed_path.resolve()
+        except Exception:
+            allowed = allowed_path
+        if target != allowed:
+            return json.dumps({
+                "error": (
+                    f"read_file is restricted to the conversation dump path. "
+                    f"Allowed: {allowed}"
+                )
+            })
+
+    if not target.exists():
+        return json.dumps({"error": f"file not found: {target}"})
+    try:
+        offset = max(1, int(offset or 1))
+        limit = max(1, min(int(limit or _JUDGE_READ_FILE_MAX_LINES), _JUDGE_READ_FILE_MAX_LINES))
+    except (TypeError, ValueError):
+        return json.dumps({"error": "offset and limit must be integers"})
+
+    try:
+        with open(target, "r", encoding="utf-8", errors="replace") as fh:
+            lines = fh.readlines()
+    except Exception as exc:
+        return json.dumps({"error": f"read failed: {exc}"})
+
+    total = len(lines)
+    start = offset - 1
+    end = min(start + limit, total)
+    slice_lines = lines[start:end]
+    out = "".join(slice_lines)
+    if len(out) > _JUDGE_READ_FILE_MAX_CHARS:
+        out = out[:_JUDGE_READ_FILE_MAX_CHARS] + "\n… [truncated by judge read cap]"
+    return json.dumps({
+        "path": str(target),
+        "total_lines": total,
+        "offset": offset,
+        "returned": len(slice_lines),
+        "next_offset": end + 1 if end < total else None,
+        "content": out,
+    }, ensure_ascii=False)
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Judge: phase-A (decompose) and phase-B (evaluate)
+# ──────────────────────────────────────────────────────────────────────
+
+
+def _get_judge_client() -> Tuple[Optional[Any], str]:
+    """Return (client, model) or (None, '') when unavailable."""
+    try:
+        from agent.auxiliary_client import get_text_auxiliary_client
+    except Exception as exc:
+        logger.debug("goal judge: auxiliary client import failed: %s", exc)
+        return None, ""
+    try:
+        client, model = get_text_auxiliary_client("goal_judge")
+    except Exception as exc:
+        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
+        return None, ""
+    if client is None or not model:
+        return None, ""
+    return client, model
+
+
+def decompose_goal(
+    goal: str,
+    *,
+    timeout: float = DEFAULT_JUDGE_TIMEOUT,
+) -> Tuple[List[Dict[str, Any]], Optional[str]]:
+    """Phase-A: ask the judge to break the goal into a checklist.
+
+    Returns ``(items, error)``. On any failure, returns ``([], reason)`` so
+    the caller can decide whether to fall back to freeform mode.
+    """
+    if not goal.strip():
+        return [], "empty goal"
+
+    client, model = _get_judge_client()
+    if client is None:
+        return [], "auxiliary client unavailable"
+
+    try:
+        resp = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
+                {
+                    "role": "user",
+                    "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
+                        goal=_truncate(goal, 4000)
+                    ),
+                },
+            ],
+            temperature=0,
+            max_tokens=2000,
+            timeout=timeout,
+        )
+    except Exception as exc:
+        logger.info("goal decompose: API call failed (%s)", exc)
+        return [], f"decompose error: {type(exc).__name__}"
+
+    try:
+        raw = resp.choices[0].message.content or ""
+    except Exception:
+        raw = ""
+
+    items, parse_failed = _parse_decompose_response(raw)
+    if parse_failed or not items:
+        logger.info(
+            "goal decompose: parse failed or empty checklist (raw=%r)",
+            _truncate(raw, 200),
+        )
+        return [], "decompose parse failed or empty"
+    logger.info("goal decompose: produced %d checklist items", len(items))
+    return items, None
+
+
+def judge_goal_freeform(
     goal: str,
     last_response: str,
     *,
     timeout: float = DEFAULT_JUDGE_TIMEOUT,
 ) -> Tuple[str, str, bool]:
-    """Ask the auxiliary model whether the goal is satisfied.
+    """Legacy freeform judge — kept for goals with no checklist.
 
     Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
-    ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
-
-    ``parse_failed`` is True only when the judge call succeeded but its output
-    was unusable (empty or non-JSON). API/transport errors return False — they
-    are transient and should fail-open silently. Callers use this flag to
-    auto-pause after N consecutive parse failures (see
-    ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
-
-    This is deliberately fail-open: any error returns ``("continue", "...", False)``
-    so a broken judge doesn't wedge progress — the turn budget and the
-    consecutive-parse-failures auto-pause are the backstops.
+    ``"continue"``, or ``"skipped"``.
     """
     if not goal.strip():
         return "skipped", "empty goal", False
     if not last_response.strip():
-        # No substantive reply this turn — almost certainly not done yet.
         return "continue", "empty response (nothing to evaluate)", False
 
-    try:
-        from agent.auxiliary_client import get_text_auxiliary_client
-    except Exception as exc:
-        logger.debug("goal judge: auxiliary client import failed: %s", exc)
+    client, model = _get_judge_client()
+    if client is None:
         return "continue", "auxiliary client unavailable", False
 
-    try:
-        client, model = get_text_auxiliary_client("goal_judge")
-    except Exception as exc:
-        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
-        return "continue", "auxiliary client unavailable", False
-
-    if client is None or not model:
-        return "continue", "no auxiliary client configured", False
-
-    prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
+    prompt = EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE.format(
         goal=_truncate(goal, 2000),
         response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS),
     )
@@ -330,7 +852,7 @@ def judge_goal(
         resp = client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
+                {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_FREEFORM},
                 {"role": "user", "content": prompt},
             ],
             temperature=0,
@@ -348,10 +870,178 @@ def judge_goal(
 
     done, reason, parse_failed = _parse_judge_response(raw)
     verdict = "done" if done else "continue"
-    logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
+    logger.info("goal judge (freeform): verdict=%s reason=%s", verdict, _truncate(reason, 120))
     return verdict, reason, parse_failed
 
 
+def evaluate_checklist(
+    state: GoalState,
+    last_response: str,
+    *,
+    history_path: Optional[Path],
+    timeout: float = DEFAULT_JUDGE_TIMEOUT,
+    max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS,
+) -> Tuple[Dict[str, Any], bool]:
+    """Phase-B: judge evaluates each pending checklist item.
+
+    Runs a bounded tool loop so the judge can call ``read_file`` on the
+    dumped conversation history when the snippet isn't enough.
+
+    Returns ``(parsed, parse_failed)`` where parsed is
+    ``{"updates": [...], "new_items": [...], "reason": str}``.
+    Falls open on transport errors: empty updates/new_items, parse_failed=False.
+    """
+    client, model = _get_judge_client()
+    if client is None:
+        return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False)
+
+    # Render checklist with 1-based indices the judge can address.
+    checklist_block = state.render_checklist(numbered=True)
+
+    user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format(
+        goal=_truncate(state.goal, 2000),
+        checklist_block=checklist_block,
+        response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS),
+        history_path=str(history_path) if history_path else "(unavailable — judge from snippet only)",
+    )
+
+    messages: List[Dict[str, Any]] = [
+        {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_CHECKLIST},
+        {"role": "user", "content": user_prompt},
+    ]
+
+    # Some auxiliary providers may not support tool calls. We pass tools
+    # optimistically; if the provider returns a verdict directly without
+    # using them, we just parse it.
+    tools = [_JUDGE_READ_FILE_TOOL_SCHEMA] if history_path is not None else None
+
+    tool_calls_left = max(0, int(max_tool_calls))
+    final_raw = ""
+
+    for _ in range(tool_calls_left + 1):
+        try:
+            kwargs: Dict[str, Any] = {
+                "model": model,
+                "messages": messages,
+                "temperature": 0,
+                "max_tokens": 1500,
+                "timeout": timeout,
+            }
+            if tools:
+                kwargs["tools"] = tools
+                kwargs["tool_choice"] = "auto"
+            resp = client.chat.completions.create(**kwargs)
+        except Exception as exc:
+            logger.info("goal judge (checklist): API call failed (%s)", exc)
+            return (
+                {
+                    "updates": [],
+                    "new_items": [],
+                    "reason": f"judge error: {type(exc).__name__}",
+                },
+                False,
+            )
+
+        try:
+            choice = resp.choices[0]
+            msg = choice.message
+        except Exception:
+            return (
+                {"updates": [], "new_items": [], "reason": "judge response malformed"},
+                True,
+            )
+
+        # Unpack tool_calls in a way that works for openai-py and other shims.
+        tool_calls = getattr(msg, "tool_calls", None) or []
+        content = getattr(msg, "content", "") or ""
+
+        if not tool_calls:
+            final_raw = content
+            break
+
+        if tool_calls_left <= 0:
+            # Out of budget. Force a final ruling on the next pass by
+            # appending a system note and disabling tools.
+            messages.append({
+                "role": "user",
+                "content": (
+                    "You have exhausted your read_file budget. Issue your "
+                    "final JSON verdict now without calling more tools."
+                ),
+            })
+            tools = None
+            continue
+
+        # Append the assistant turn, then handle each tool call.
+        assistant_record: Dict[str, Any] = {
+            "role": "assistant",
+            "content": content,
+            "tool_calls": [],
+        }
+        for tc in tool_calls:
+            try:
+                tc_id = getattr(tc, "id", None) or "tc-?"
+                fn = getattr(tc, "function", None)
+                fn_name = getattr(fn, "name", "") if fn is not None else ""
+                fn_args = getattr(fn, "arguments", "") if fn is not None else ""
+                assistant_record["tool_calls"].append({
+                    "id": tc_id,
+                    "type": "function",
+                    "function": {"name": fn_name, "arguments": fn_args},
+                })
+            except Exception:
+                continue
+        messages.append(assistant_record)
+
+        for tc in tool_calls:
+            try:
+                tc_id = getattr(tc, "id", None) or "tc-?"
+                fn = getattr(tc, "function", None)
+                fn_name = getattr(fn, "name", "") if fn is not None else ""
+                fn_args_raw = getattr(fn, "arguments", "") if fn is not None else ""
+            except Exception:
+                continue
+            try:
+                args = json.loads(fn_args_raw) if isinstance(fn_args_raw, str) else (fn_args_raw or {})
+            except Exception:
+                args = {}
+            if fn_name == "read_file":
+                tool_result = _judge_read_file(
+                    str(args.get("path", "")),
+                    offset=args.get("offset", 1),
+                    limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
+                    allowed_path=history_path,
+                )
+            else:
+                tool_result = json.dumps({"error": f"unknown tool: {fn_name}"})
+            messages.append({
+                "role": "tool",
+                "tool_call_id": tc_id,
+                "name": fn_name,
+                "content": tool_result,
+            })
+            tool_calls_left -= 1
+
+        if tool_calls_left <= 0:
+            messages.append({
+                "role": "user",
+                "content": (
+                    "You have exhausted your read_file budget. Issue your "
+                    "final JSON verdict now without calling more tools."
+                ),
+            })
+            tools = None
+
+    parsed, parse_failed = _parse_evaluate_response(final_raw)
+    logger.info(
+        "goal judge (checklist): updates=%d new_items=%d reason=%s",
+        len(parsed.get("updates") or []),
+        len(parsed.get("new_items") or []),
+        _truncate(parsed.get("reason", ""), 120),
+    )
+    return parsed, parse_failed
+
+
 # ──────────────────────────────────────────────────────────────────────
 # GoalManager — the orchestration surface CLI + gateway talk to
 # ──────────────────────────────────────────────────────────────────────
@@ -368,8 +1058,12 @@ class GoalManager:
     - ``clear()`` — remove the active goal.
     - ``pause()`` / ``resume()`` — explicit user controls.
     - ``status()`` — printable one-liner.
-    - ``evaluate_after_turn(last_response)`` — call the judge, update state,
-      and return a decision dict the caller uses to drive the next turn.
+    - ``add_subgoal(text)`` — user appends a checklist item.
+    - ``mark_subgoal(index, status)`` — user flips an item (override).
+    - ``remove_subgoal(index)`` — user deletes an item.
+    - ``clear_checklist()`` — user wipes the checklist; next turn re-decomposes.
+    - ``evaluate_after_turn(last_response, agent=None)`` — call the judge,
+      update state, return a decision dict.
     - ``next_continuation_prompt()`` — the canonical user-role message to
       feed back into ``run_conversation``.
     """
@@ -396,14 +1090,26 @@ class GoalManager:
         if s is None or s.status in ("cleared",):
             return "No active goal. Set one with /goal <text>."
         turns = f"{s.turns_used}/{s.max_turns} turns"
+        cl_total, cl_done, cl_imp, _ = s.checklist_counts()
+        cl_text = ""
+        if cl_total:
+            cl_text = f", {cl_done + cl_imp}/{cl_total} done"
         if s.status == "active":
-            return f"⊙ Goal (active, {turns}): {s.goal}"
+            return f"⊙ Goal (active, {turns}{cl_text}): {s.goal}"
         if s.status == "paused":
             extra = f" — {s.paused_reason}" if s.paused_reason else ""
-            return f"⏸ Goal (paused, {turns}{extra}): {s.goal}"
+            return f"⏸ Goal (paused, {turns}{cl_text}{extra}): {s.goal}"
         if s.status == "done":
-            return f"✓ Goal done ({turns}): {s.goal}"
-        return f"Goal ({s.status}, {turns}): {s.goal}"
+            return f"✓ Goal done ({turns}{cl_text}): {s.goal}"
+        return f"Goal ({s.status}, {turns}{cl_text}): {s.goal}"
+
+    def render_checklist(self) -> str:
+        """Public helper for the /subgoal slash command."""
+        if self._state is None:
+            return "(no active goal)"
+        if not self._state.checklist:
+            return "(checklist empty — judge will populate it on the next turn)"
+        return self._state.render_checklist(numbered=True)
 
     # --- mutation -----------------------------------------------------
 
@@ -418,6 +1124,8 @@ class GoalManager:
             max_turns=int(max_turns) if max_turns else self.default_max_turns,
             created_at=time.time(),
             last_turn_at=0.0,
+            checklist=[],
+            decomposed=False,
         )
         self._state = state
         save_goal(self.session_id, state)
@@ -456,6 +1164,77 @@ class GoalManager:
         self._state.last_reason = reason
         save_goal(self.session_id, self._state)
 
+    # --- /subgoal user controls ---------------------------------------
+
+    def add_subgoal(self, text: str) -> ChecklistItem:
+        """User appends a new checklist item. Requires an active or paused goal."""
+        if self._state is None:
+            raise RuntimeError("no active goal")
+        text = (text or "").strip()
+        if not text:
+            raise ValueError("subgoal text is empty")
+        item = ChecklistItem(
+            text=text,
+            status=ITEM_PENDING,
+            added_by=ADDED_BY_USER,
+            added_at=time.time(),
+        )
+        self._state.checklist.append(item)
+        save_goal(self.session_id, self._state)
+        return item
+
+    def mark_subgoal(self, index_1based: int, status: str) -> ChecklistItem:
+        """User overrides an item's status.
+
+        ``status`` may be ``completed``, ``impossible``, or ``pending``
+        (the last only as an undo flow). Stickiness rules do NOT apply to
+        user actions — the user is the only authority that can revert
+        terminal items.
+        """
+        if self._state is None:
+            raise RuntimeError("no active goal")
+        status = (status or "").strip().lower()
+        if status not in VALID_ITEM_STATUSES:
+            raise ValueError(
+                f"status must be one of {sorted(VALID_ITEM_STATUSES)}; got {status!r}"
+            )
+        idx = int(index_1based) - 1
+        if idx < 0 or idx >= len(self._state.checklist):
+            raise IndexError(
+                f"index out of range (1..{len(self._state.checklist)})"
+            )
+        item = self._state.checklist[idx]
+        item.status = status
+        if status in TERMINAL_ITEM_STATUSES:
+            item.completed_at = time.time()
+            if not item.evidence:
+                item.evidence = "marked by user"
+        else:
+            item.completed_at = None
+            # Don't wipe judge-supplied evidence on undo — useful audit trail.
+        save_goal(self.session_id, self._state)
+        return item
+
+    def remove_subgoal(self, index_1based: int) -> ChecklistItem:
+        if self._state is None:
+            raise RuntimeError("no active goal")
+        idx = int(index_1based) - 1
+        if idx < 0 or idx >= len(self._state.checklist):
+            raise IndexError(
+                f"index out of range (1..{len(self._state.checklist)})"
+            )
+        removed = self._state.checklist.pop(idx)
+        save_goal(self.session_id, self._state)
+        return removed
+
+    def clear_checklist(self) -> None:
+        """Wipe the checklist and reset decomposed=False so the judge re-decomposes."""
+        if self._state is None:
+            return
+        self._state.checklist = []
+        self._state.decomposed = False
+        save_goal(self.session_id, self._state)
+
     # --- the main entry point called after every turn -----------------
 
     def evaluate_after_turn(
@@ -463,6 +1242,8 @@ class GoalManager:
         last_response: str,
         *,
         user_initiated: bool = True,
+        agent: Any = None,
+        messages: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         """Run the judge and update state. Return a decision dict.
 
@@ -470,11 +1251,21 @@ class GoalManager:
         continuation prompt we fed ourselves (False). Both increment
         ``turns_used`` because both consume model budget.
 
+        ``messages`` is the agent's full conversation list for this session.
+        When provided, it's dumped to ``<HERMES_HOME>/goals/<sid>.json`` so
+        the Phase-B judge's read_file tool can inspect history. Optional —
+        when missing, the judge runs from the snippet only.
+
+        ``agent`` is a back-compat path — when ``messages`` is None we try
+        to extract them from common AIAgent attribute names. Most callers
+        should pass ``messages`` directly because AIAgent does not store
+        the message list as a public instance attribute.
+
         Decision keys:
           - ``status``: current goal status after update
           - ``should_continue``: bool — caller should fire another turn
           - ``continuation_prompt``: str or None
-          - ``verdict``: "done" | "continue" | "skipped" | "inactive"
+          - ``verdict``: "done" | "continue" | "skipped" | "inactive" | "decompose"
           - ``reason``: str
           - ``message``: user-visible one-liner to print/send
         """
@@ -493,7 +1284,45 @@ class GoalManager:
         state.turns_used += 1
         state.last_turn_at = time.time()
 
-        verdict, reason, parse_failed = judge_goal(state.goal, last_response)
+        # ── Phase A: decompose (first call after /goal set) ───────────
+        if not state.decomposed:
+            items, err = decompose_goal(state.goal)
+            state.decomposed = True
+            decompose_message = ""
+            if items:
+                now = time.time()
+                for entry in items:
+                    state.checklist.append(
+                        ChecklistItem(
+                            text=entry["text"],
+                            status=ITEM_PENDING,
+                            added_by=ADDED_BY_JUDGE,
+                            added_at=now,
+                        )
+                    )
+                state.last_verdict = "decompose"
+                state.last_reason = f"decomposed into {len(items)} items"
+                decompose_message = (
+                    f"⊙ Goal checklist created ({len(items)} items). "
+                    f"Use /subgoal to view or edit it."
+                )
+                save_goal(self.session_id, state)
+                return {
+                    "status": "active",
+                    "should_continue": True,
+                    "continuation_prompt": self.next_continuation_prompt(),
+                    "verdict": "decompose",
+                    "reason": state.last_reason,
+                    "message": decompose_message,
+                }
+            # Decompose failed — fall through to freeform mode below.
+            logger.info("goal: decompose failed (%s) — falling back to freeform judge", err)
+            state.last_reason = f"decompose failed: {err}"
+
+        # ── Phase B: evaluate ────────────────────────────────────────
+        verdict, reason, parse_failed = self._evaluate_state_phase_b(
+            state, last_response, agent=agent, messages=messages
+        )
         state.last_verdict = verdict
         state.last_reason = reason
 
@@ -518,11 +1347,7 @@ class GoalManager:
             }
 
         # Auto-pause when the judge model can't produce the expected JSON
-        # verdict N turns in a row. Points the user at the goal_judge config
-        # so they can route this side task to a model that follows the
-        # contract (e.g. google/gemini-3-flash-preview). Without this guard,
-        # weak judge models burn the entire turn budget returning prose or
-        # empty strings.
+        # verdict N turns in a row.
         if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
             state.status = "paused"
             state.paused_reason = (
@@ -564,6 +1389,10 @@ class GoalManager:
             }
 
         save_goal(self.session_id, state)
+        cl_total, cl_done, cl_imp, _ = state.checklist_counts()
+        progress = ""
+        if cl_total:
+            progress = f" — {cl_done + cl_imp}/{cl_total} done"
         return {
             "status": "active",
             "should_continue": True,
@@ -571,23 +1400,168 @@ class GoalManager:
             "verdict": "continue",
             "reason": reason,
             "message": (
-                f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}"
+                f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}{progress}): {reason}"
             ),
         }
 
+    def _evaluate_state_phase_b(
+        self,
+        state: GoalState,
+        last_response: str,
+        *,
+        agent: Any = None,
+        messages: Optional[List[Dict[str, Any]]] = None,
+    ) -> Tuple[str, str, bool]:
+        """Run the right kind of Phase-B evaluation given current state.
+
+        With a non-empty checklist: harsh per-item evaluation with a bounded
+        read_file tool loop.
+
+        With an empty checklist (e.g. decompose failed twice): fall back to
+        the legacy freeform judge so the goal still has a way to terminate.
+        """
+        if not last_response.strip():
+            return "continue", "empty response (nothing to evaluate)", False
+
+        if state.checklist:
+            # Dump conversation history if we have one. Prefer explicit
+            # ``messages`` arg (most reliable); fall back to extracting from
+            # the agent instance for back-compat.
+            history_path: Optional[Path] = None
+            msgs: List[Dict[str, Any]] = []
+            if messages:
+                msgs = list(messages)
+            elif agent is not None:
+                msgs = self._extract_agent_messages(agent)
+            if msgs:
+                history_path = dump_conversation(self.session_id, msgs)
+                if history_path is None:
+                    logger.debug(
+                        "goal: conversation dump failed for session %s",
+                        self.session_id,
+                    )
+            else:
+                logger.debug(
+                    "goal: no messages available for session %s — judge will run from snippet only",
+                    self.session_id,
+                )
+
+            parsed, parse_failed = evaluate_checklist(
+                state, last_response, history_path=history_path
+            )
+            self._apply_checklist_updates(state, parsed)
+
+            if state.all_terminal():
+                return "done", parsed.get("reason") or "all checklist items terminal", parse_failed
+            return "continue", parsed.get("reason") or "checklist progress", parse_failed
+
+        # No checklist — freeform fallback.
+        verdict, reason, parse_failed = judge_goal_freeform(state.goal, last_response)
+        return verdict, reason, parse_failed
+
+    # --- internal helpers ---------------------------------------------
+
+    @staticmethod
+    def _extract_agent_messages(agent: Any) -> List[Dict[str, Any]]:
+        """Best-effort extraction of the agent's conversation history.
+
+        Tries common attribute names so we don't tightly couple to AIAgent.
+        Returns an empty list when nothing is available.
+        """
+        for attr in ("messages", "conversation_history", "_messages", "history"):
+            try:
+                msgs = getattr(agent, attr, None)
+                if isinstance(msgs, list) and msgs:
+                    return msgs
+            except Exception:
+                continue
+        return []
+
+    @staticmethod
+    def _apply_checklist_updates(state: GoalState, parsed: Dict[str, Any]) -> None:
+        """Apply judge updates with stickiness: never regress terminal items."""
+        now = time.time()
+        for upd in parsed.get("updates") or []:
+            try:
+                idx = int(upd["index"])
+            except (KeyError, TypeError, ValueError):
+                continue
+            if idx < 0 or idx >= len(state.checklist):
+                continue
+            item = state.checklist[idx]
+            if item.status in TERMINAL_ITEM_STATUSES:
+                # Stickiness: judge cannot regress a terminal item.
+                continue
+            new_status = upd.get("status")
+            if new_status not in TERMINAL_ITEM_STATUSES:
+                continue
+            item.status = new_status
+            item.completed_at = now
+            evidence = upd.get("evidence")
+            if evidence:
+                item.evidence = evidence
+
+        for new_item in parsed.get("new_items") or []:
+            text = (new_item.get("text") or "").strip()
+            if not text:
+                continue
+            state.checklist.append(
+                ChecklistItem(
+                    text=text,
+                    status=ITEM_PENDING,
+                    added_by=ADDED_BY_JUDGE,
+                    added_at=now,
+                )
+            )
+
+    # --- continuation prompt ------------------------------------------
+
     def next_continuation_prompt(self) -> Optional[str]:
         if not self._state or self._state.status != "active":
             return None
-        return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)
+        if not self._state.checklist:
+            return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)
+        cl_total, cl_done, cl_imp, _ = self._state.checklist_counts()
+        return CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE.format(
+            goal=self._state.goal,
+            done=cl_done + cl_imp,
+            total=cl_total,
+            checklist=self._state.render_checklist(numbered=False),
+        )
+
+
+# Public name kept for back-compat with the previous freeform-only API.
+def judge_goal(
+    goal: str,
+    last_response: str,
+    *,
+    timeout: float = DEFAULT_JUDGE_TIMEOUT,
+) -> Tuple[str, str, bool]:
+    """Back-compat wrapper — defers to the freeform judge."""
+    return judge_goal_freeform(goal, last_response, timeout=timeout)
 
 
 __all__ = [
+    "ChecklistItem",
     "GoalState",
     "GoalManager",
     "CONTINUATION_PROMPT_TEMPLATE",
+    "CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE",
     "DEFAULT_MAX_TURNS",
+    "DEFAULT_MAX_JUDGE_TOOL_CALLS",
+    "ITEM_PENDING",
+    "ITEM_COMPLETED",
+    "ITEM_IMPOSSIBLE",
+    "ITEM_MARKERS",
+    "TERMINAL_ITEM_STATUSES",
+    "VALID_ITEM_STATUSES",
     "load_goal",
     "save_goal",
     "clear_goal",
     "judge_goal",
+    "judge_goal_freeform",
+    "decompose_goal",
+    "evaluate_checklist",
+    "conversation_dump_path",
+    "dump_conversation",
 ]
diff --git a/tests/cli/test_cli_goal_interrupt.py b/tests/cli/test_cli_goal_interrupt.py
index 851b87e856b..879e87c6a73 100644
--- a/tests/cli/test_cli_goal_interrupt.py
+++ b/tests/cli/test_cli_goal_interrupt.py
@@ -58,6 +58,11 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"):
 
     mgr = GoalManager(session_id=session_id, default_max_turns=5)
     mgr.set(goal_text)
+    # Skip Phase-A decompose so tests can patch judge_goal_freeform directly
+    # for legacy verdict assertions.
+    mgr.state.decomposed = True
+    from hermes_cli.goals import save_goal as _sg
+    _sg(mgr.session_id, mgr.state)
     cli._goal_manager = mgr
     return cli, mgr
 
@@ -81,7 +86,7 @@ class TestInterruptAutoPause:
 
         # Judge MUST NOT run on an interrupted turn. If it does, we've
         # regressed — fail loudly instead of silently querying a mock.
-        with patch("hermes_cli.goals.judge_goal") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called on an interrupted turn"
             )
@@ -106,7 +111,7 @@ class TestInterruptAutoPause:
         cli.conversation_history = [
             {"role": "assistant", "content": "partial"},
         ]
-        with patch("hermes_cli.goals.judge_goal"):
+        with patch("hermes_cli.goals.judge_goal_freeform"):
             cli._maybe_continue_goal_after_turn()
         assert mgr.state.status == "paused"
 
@@ -125,7 +130,7 @@ class TestEmptyResponseSkip:
             {"role": "assistant", "content": "   \n\n   "},
         ]
 
-        with patch("hermes_cli.goals.judge_goal") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called on an empty response"
             )
@@ -144,7 +149,7 @@ class TestEmptyResponseSkip:
             {"role": "user", "content": "go"},
         ]
 
-        with patch("hermes_cli.goals.judge_goal") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called without an assistant response"
             )
@@ -169,7 +174,7 @@ class TestHealthyTurnStillRuns:
 
         # Force the judge to say "continue" without touching the network.
         with patch(
-            "hermes_cli.goals.judge_goal",
+            "hermes_cli.goals.judge_goal_freeform",
             return_value=("continue", "needs more steps", False),
         ):
             cli._maybe_continue_goal_after_turn()
@@ -189,7 +194,7 @@ class TestHealthyTurnStillRuns:
         ]
 
         with patch(
-            "hermes_cli.goals.judge_goal",
+            "hermes_cli.goals.judge_goal_freeform",
             return_value=("done", "goal satisfied", False),
         ):
             cli._maybe_continue_goal_after_turn()
diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py
index 14f536aa4f8..ce9e8d6ad1e 100644
--- a/tests/gateway/test_goal_verdict_send.py
+++ b/tests/gateway/test_goal_verdict_send.py
@@ -106,8 +106,11 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
 
     mgr = GoalManager(session_entry.session_id)
     mgr.set("ship the feature")
+    mgr.state.decomposed = True
+    from hermes_cli.goals import save_goal as _sg
+    _sg(mgr.session_id, mgr.state)
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
+    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -135,8 +138,11 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
 
     mgr = GoalManager(session_entry.session_id)
     mgr.set("polish the docs")
+    mgr.state.decomposed = True
+    from hermes_cli.goals import save_goal as _sg
+    _sg(mgr.session_id, mgr.state)
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
+    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -164,7 +170,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
     state.turns_used = 2
     save_goal(session_entry.session_id, state)
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
+    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -211,7 +217,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
 
     runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
+    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)):
         # must not raise
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py
index b5afd716c9e..47a043a32cf 100644
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@@ -253,14 +253,20 @@ class TestGoalManager:
         assert mgr2.is_active()
 
     def test_evaluate_after_turn_done(self, hermes_home):
-        """Judge says done → status=done, no continuation."""
+        """Judge says done → status=done, no continuation.
+
+        Skips Phase-A decompose by patching ``decompose_goal`` to return
+        an empty checklist so the manager falls through to the freeform
+        judge path (legacy behavior preserved when decompose is unavailable).
+        """
         from hermes_cli import goals
         from hermes_cli.goals import GoalManager
 
         mgr = GoalManager(session_id="eval-sid-1")
         mgr.set("ship it")
 
-        with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)):
             decision = mgr.evaluate_after_turn("I shipped the feature.")
 
         assert decision["verdict"] == "done"
@@ -276,7 +282,8 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
         mgr.set("a long goal")
 
-        with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)):
             decision = mgr.evaluate_after_turn("made some progress")
 
         assert decision["verdict"] == "continue"
@@ -294,7 +301,8 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
         mgr.set("hard goal")
 
-        with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)):
             d1 = mgr.evaluate_after_turn("step 1")
             assert d1["should_continue"] is True
             assert mgr.state.turns_used == 1
@@ -434,9 +442,11 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
         mgr.set("do a thing")
 
-        with patch.object(
-            goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
-        ):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(
+                 goals, "judge_goal_freeform",
+                 return_value=("continue", "judge returned empty response", True),
+             ):
             d1 = mgr.evaluate_after_turn("step 1")
             assert d1["should_continue"] is True
             assert mgr.state.consecutive_parse_failures == 1
@@ -463,17 +473,21 @@ class TestJudgeParseFailureAutoPause:
         mgr.set("another goal")
 
         # Two parse failures…
-        with patch.object(
-            goals, "judge_goal", return_value=("continue", "not json", True)
-        ):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(
+                 goals, "judge_goal_freeform",
+                 return_value=("continue", "not json", True),
+             ):
             mgr.evaluate_after_turn("step 1")
             mgr.evaluate_after_turn("step 2")
             assert mgr.state.consecutive_parse_failures == 2
 
         # …then one clean reply resets the counter.
-        with patch.object(
-            goals, "judge_goal", return_value=("continue", "making progress", False)
-        ):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(
+                 goals, "judge_goal_freeform",
+                 return_value=("continue", "making progress", False),
+             ):
             d = mgr.evaluate_after_turn("step 3")
             assert d["should_continue"] is True
             assert mgr.state.consecutive_parse_failures == 0
@@ -486,9 +500,11 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
         mgr.set("goal")
 
-        with patch.object(
-            goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
-        ):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(
+                 goals, "judge_goal_freeform",
+                 return_value=("continue", "judge error: RuntimeError", False),
+             ):
             for _ in range(5):
                 d = mgr.evaluate_after_turn("still going")
                 assert d["should_continue"] is True
@@ -505,12 +521,553 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
         mgr.set("persistent goal")
 
-        with patch.object(
-            goals, "judge_goal", return_value=("continue", "empty", True)
-        ):
+        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
+             patch.object(
+                 goals, "judge_goal_freeform",
+                 return_value=("continue", "empty", True),
+             ):
             mgr.evaluate_after_turn("r")
             mgr.evaluate_after_turn("r")
 
         reloaded = load_goal("parse-fail-sid-4")
         assert reloaded is not None
         assert reloaded.consecutive_parse_failures == 2
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Checklist mode: GoalState backcompat + ChecklistItem
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestGoalStateBackcompat:
+    def test_old_state_meta_row_loads_without_checklist_fields(self):
+        """A goal serialized BEFORE the checklist fields existed must
+        round-trip through GoalState.from_json with empty defaults."""
+        from hermes_cli.goals import GoalState
+
+        legacy_json = json.dumps({
+            "goal": "do the thing",
+            "status": "active",
+            "turns_used": 3,
+            "max_turns": 20,
+            "created_at": 1.0,
+            "last_turn_at": 2.0,
+            "last_verdict": "continue",
+            "last_reason": "still working",
+            "paused_reason": None,
+            "consecutive_parse_failures": 1,
+        })
+        state = GoalState.from_json(legacy_json)
+        assert state.goal == "do the thing"
+        assert state.checklist == []
+        assert state.decomposed is False
+
+    def test_new_state_round_trip(self):
+        from hermes_cli.goals import (
+            ChecklistItem,
+            GoalState,
+            ITEM_COMPLETED,
+            ITEM_PENDING,
+            ADDED_BY_JUDGE,
+            ADDED_BY_USER,
+        )
+
+        state = GoalState(
+            goal="g",
+            decomposed=True,
+            checklist=[
+                ChecklistItem(text="a", status=ITEM_COMPLETED,
+                              added_by=ADDED_BY_JUDGE, evidence="done"),
+                ChecklistItem(text="b", status=ITEM_PENDING,
+                              added_by=ADDED_BY_USER),
+            ],
+        )
+        round_tripped = GoalState.from_json(state.to_json())
+        assert round_tripped.decomposed is True
+        assert len(round_tripped.checklist) == 2
+        assert round_tripped.checklist[0].text == "a"
+        assert round_tripped.checklist[0].status == ITEM_COMPLETED
+        assert round_tripped.checklist[0].evidence == "done"
+        assert round_tripped.checklist[1].added_by == ADDED_BY_USER
+
+    def test_checklist_counts_and_all_terminal(self):
+        from hermes_cli.goals import (
+            ChecklistItem, GoalState,
+            ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING,
+        )
+
+        state = GoalState(
+            goal="g",
+            checklist=[
+                ChecklistItem(text="a", status=ITEM_COMPLETED),
+                ChecklistItem(text="b", status=ITEM_IMPOSSIBLE),
+                ChecklistItem(text="c", status=ITEM_PENDING),
+            ],
+        )
+        total, done, imp, pending = state.checklist_counts()
+        assert (total, done, imp, pending) == (3, 1, 1, 1)
+        assert state.all_terminal() is False
+
+        state.checklist[2].status = ITEM_IMPOSSIBLE
+        assert state.all_terminal() is True
+
+    def test_empty_checklist_is_not_all_terminal(self):
+        """Empty list must NOT be considered done."""
+        from hermes_cli.goals import GoalState
+
+        state = GoalState(goal="g")
+        assert state.all_terminal() is False
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Phase A: decompose
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestPhaseADecompose:
+    def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE
+
+        mgr = GoalManager(session_id="phase-a-sid-1")
+        mgr.set("build a website")
+
+        items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}]
+        with patch.object(goals, "decompose_goal", return_value=(items, None)):
+            d = mgr.evaluate_after_turn("(initial response)")
+
+        assert d["verdict"] == "decompose"
+        assert d["should_continue"] is True
+        # Phase A produces a continuation prompt that includes the checklist.
+        assert d["continuation_prompt"] is not None
+        assert "Checklist progress" in d["continuation_prompt"]
+        assert mgr.state.decomposed is True
+        assert len(mgr.state.checklist) == 2
+        assert mgr.state.checklist[0].text == "homepage exists"
+        assert mgr.state.checklist[0].status == ITEM_PENDING
+        assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE
+
+    def test_decompose_only_runs_once(self, hermes_home):
+        """Decomposed=True after first call. Subsequent calls go to Phase B."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager
+
+        mgr = GoalManager(session_id="phase-a-sid-2")
+        mgr.set("g")
+
+        with patch.object(
+            goals, "decompose_goal", return_value=([{"text": "x"}], None)
+        ) as decompose_mock, patch.object(
+            goals, "evaluate_checklist",
+            return_value=({"updates": [], "new_items": [], "reason": "..."}, False),
+        ) as eval_mock:
+            mgr.evaluate_after_turn("turn 1")
+            mgr.evaluate_after_turn("turn 2")
+            mgr.evaluate_after_turn("turn 3")
+
+        assert decompose_mock.call_count == 1
+        assert eval_mock.call_count == 2
+
+    def test_decompose_failure_falls_back_to_freeform(self, hermes_home):
+        """If decompose returns no items, manager falls through to freeform judge."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager
+
+        mgr = GoalManager(session_id="phase-a-sid-3")
+        mgr.set("g")
+
+        with patch.object(goals, "decompose_goal", return_value=([], "model error")), \
+             patch.object(goals, "judge_goal_freeform",
+                          return_value=("done", "shipped", False)):
+            d = mgr.evaluate_after_turn("done!")
+
+        assert d["verdict"] == "done"
+        assert mgr.state.decomposed is True
+        assert mgr.state.checklist == []
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Phase B: evaluate (checklist mode)
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestPhaseBChecklist:
+    def _make_decomposed_mgr(self, sid: str, items):
+        """Helper: skip Phase A, install a decomposed checklist directly."""
+        from hermes_cli.goals import (
+            GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE,
+        )
+        from hermes_cli import goals as _g
+        mgr = GoalManager(session_id=sid)
+        mgr.set("a goal")
+        mgr.state.decomposed = True
+        mgr.state.checklist = [
+            ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE)
+            for t in items
+        ]
+        _g.save_goal(sid, mgr.state)
+        return mgr
+
+    def test_judge_flips_pending_to_completed(self, hermes_home):
+        from hermes_cli import goals
+        from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING
+
+        mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"])
+        with patch.object(
+            goals, "evaluate_checklist",
+            return_value=(
+                {
+                    "updates": [
+                        {"index": 0, "status": "completed", "evidence": "done"},
+                        {"index": 1, "status": "completed", "evidence": "shipped"},
+                    ],
+                    "new_items": [],
+                    "reason": "made progress",
+                },
+                False,
+            ),
+        ):
+            d = mgr.evaluate_after_turn("agent did stuff")
+
+        assert d["verdict"] == "continue"
+        assert mgr.state.checklist[0].status == ITEM_COMPLETED
+        assert mgr.state.checklist[0].evidence == "done"
+        assert mgr.state.checklist[1].status == ITEM_COMPLETED
+        assert mgr.state.checklist[2].status == ITEM_PENDING
+
+    def test_goal_done_when_all_items_terminal(self, hermes_home):
+        from hermes_cli import goals
+
+        mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"])
+        with patch.object(
+            goals, "evaluate_checklist",
+            return_value=(
+                {
+                    "updates": [
+                        {"index": 0, "status": "completed", "evidence": "ok"},
+                        {"index": 1, "status": "impossible", "evidence": "blocked"},
+                    ],
+                    "new_items": [],
+                    "reason": "all done or blocked",
+                },
+                False,
+            ),
+        ):
+            d = mgr.evaluate_after_turn("response")
+
+        assert d["verdict"] == "done"
+        assert d["should_continue"] is False
+        assert mgr.state.status == "done"
+
+    def test_stickiness_judge_cannot_regress_completed(self, hermes_home):
+        """Once an item is completed, judge updates trying to flip it back are ignored."""
+        from hermes_cli import goals
+        from hermes_cli.goals import ITEM_COMPLETED
+
+        mgr = self._make_decomposed_mgr("phase-b-stick", ["a"])
+        # First turn completes item 0.
+        with patch.object(
+            goals, "evaluate_checklist",
+            return_value=(
+                {
+                    "updates": [{"index": 0, "status": "completed", "evidence": "yes"}],
+                    "new_items": [],
+                    "reason": "done",
+                },
+                False,
+            ),
+        ):
+            mgr.evaluate_after_turn("turn 1")
+        assert mgr.state.checklist[0].status == ITEM_COMPLETED
+        # Second turn: judge tries to send a non-terminal update.
+        # _parse_evaluate_response already filters non-terminal, but at the
+        # apply layer we also skip terminal items entirely. Smoke both.
+        with patch.object(
+            goals, "evaluate_checklist",
+            return_value=(
+                {
+                    "updates": [{"index": 0, "status": "impossible", "evidence": "regress"}],
+                    "new_items": [],
+                    "reason": "trying to regress",
+                },
+                False,
+            ),
+        ):
+            mgr.evaluate_after_turn("turn 2")
+        # Sticky: status stays completed, evidence unchanged.
+        assert mgr.state.checklist[0].status == ITEM_COMPLETED
+        assert mgr.state.checklist[0].evidence == "yes"
+
+    def test_judge_appends_new_items(self, hermes_home):
+        from hermes_cli import goals
+
+        mgr = self._make_decomposed_mgr("phase-b-new", ["a"])
+        with patch.object(
+            goals, "evaluate_checklist",
+            return_value=(
+                {
+                    "updates": [],
+                    "new_items": [{"text": "newly discovered"}, {"text": "also this"}],
+                    "reason": "found more work",
+                },
+                False,
+            ),
+        ):
+            mgr.evaluate_after_turn("response")
+        assert len(mgr.state.checklist) == 3
+        assert mgr.state.checklist[1].text == "newly discovered"
+        assert mgr.state.checklist[1].added_by == "judge"
+
+
+# ──────────────────────────────────────────────────────────────────────
+# /subgoal user controls
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestSubgoalUserControls:
+    def test_add_subgoal_appends_user_item(self, hermes_home):
+        from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER
+
+        mgr = GoalManager(session_id="user-sid-1")
+        mgr.set("g")
+        item = mgr.add_subgoal("user added")
+        assert item.text == "user added"
+        assert item.status == ITEM_PENDING
+        assert item.added_by == ADDED_BY_USER
+        assert len(mgr.state.checklist) == 1
+
+    def test_add_subgoal_requires_active_goal(self, hermes_home):
+        from hermes_cli.goals import GoalManager
+        mgr = GoalManager(session_id="user-sid-2")
+        with pytest.raises(RuntimeError):
+            mgr.add_subgoal("x")
+
+    def test_add_subgoal_rejects_empty_text(self, hermes_home):
+        from hermes_cli.goals import GoalManager
+        mgr = GoalManager(session_id="user-sid-3")
+        mgr.set("g")
+        with pytest.raises(ValueError):
+            mgr.add_subgoal("   ")
+
+    def test_mark_subgoal_uses_1_based_index(self, hermes_home):
+        from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE
+        mgr = GoalManager(session_id="user-sid-4")
+        mgr.set("g")
+        mgr.add_subgoal("a")
+        mgr.add_subgoal("b")
+        mgr.add_subgoal("c")
+        mgr.mark_subgoal(2, "completed")
+        mgr.mark_subgoal(3, "impossible")
+        assert mgr.state.checklist[0].status == "pending"
+        assert mgr.state.checklist[1].status == ITEM_COMPLETED
+        assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE
+
+    def test_mark_subgoal_rejects_invalid_index(self, hermes_home):
+        from hermes_cli.goals import GoalManager
+        mgr = GoalManager(session_id="user-sid-5")
+        mgr.set("g")
+        mgr.add_subgoal("a")
+        with pytest.raises(IndexError):
+            mgr.mark_subgoal(5, "completed")
+        with pytest.raises(IndexError):
+            mgr.mark_subgoal(0, "completed")
+
+    def test_user_can_revert_terminal_item(self, hermes_home):
+        """User mark_subgoal bypasses stickiness — only path to revert."""
+        from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING
+        mgr = GoalManager(session_id="user-sid-6")
+        mgr.set("g")
+        mgr.add_subgoal("a")
+        mgr.mark_subgoal(1, "completed")
+        assert mgr.state.checklist[0].status == ITEM_COMPLETED
+        mgr.mark_subgoal(1, "pending")
+        assert mgr.state.checklist[0].status == ITEM_PENDING
+
+    def test_remove_subgoal(self, hermes_home):
+        from hermes_cli.goals import GoalManager
+        mgr = GoalManager(session_id="user-sid-7")
+        mgr.set("g")
+        mgr.add_subgoal("a")
+        mgr.add_subgoal("b")
+        mgr.add_subgoal("c")
+        removed = mgr.remove_subgoal(2)
+        assert removed.text == "b"
+        assert [it.text for it in mgr.state.checklist] == ["a", "c"]
+
+    def test_clear_checklist_resets_decomposed(self, hermes_home):
+        from hermes_cli.goals import GoalManager
+        mgr = GoalManager(session_id="user-sid-8")
+        mgr.set("g")
+        mgr.state.decomposed = True
+        mgr.add_subgoal("a")
+        mgr.clear_checklist()
+        assert mgr.state.checklist == []
+        assert mgr.state.decomposed is False
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Conversation dump
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestConversationDump:
+    def test_dump_writes_messages_to_goals_dir(self, hermes_home):
+        from hermes_cli.goals import dump_conversation, conversation_dump_path
+
+        msgs = [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "hello"},
+        ]
+        path = dump_conversation("dump-sid-1", msgs)
+        assert path is not None
+        assert path.exists()
+        # Path is under <HERMES_HOME>/goals/<sid>.json
+        assert path.parent.name == "goals"
+        assert path.name == "dump-sid-1.json"
+
+        loaded = json.loads(path.read_text())
+        assert loaded == msgs
+
+        # conversation_dump_path returns the same path
+        assert conversation_dump_path("dump-sid-1") == path
+
+    def test_dump_handles_unsafe_session_id(self, hermes_home):
+        from hermes_cli.goals import dump_conversation
+
+        path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}])
+        assert path is not None
+        # No traversal — slashes are normalized to underscores. (Periods are
+        # preserved because they're legitimate in filenames; the resulting
+        # name still cannot escape <HERMES_HOME>/goals/ since path
+        # separators are gone.)
+        assert "/" not in path.name
+        assert path.parent.name == "goals"
+        # Verify the resolved path stays under the goals dir.
+        from hermes_cli.goals import _goals_dump_dir
+        goals_dir = _goals_dump_dir().resolve()
+        assert str(path.resolve()).startswith(str(goals_dir))
+
+    def test_dump_skips_when_messages_empty(self, hermes_home):
+        from hermes_cli.goals import dump_conversation
+        assert dump_conversation("sid", []) is None
+        assert dump_conversation("", [{"role": "user", "content": "x"}]) is None
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Judge read_file tool: path restriction
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestJudgeReadFile:
+    def test_restricted_to_allowed_path(self, hermes_home, tmp_path):
+        from hermes_cli.goals import _judge_read_file
+
+        allowed = tmp_path / "allowed.json"
+        allowed.write_text("hello\nworld\n")
+
+        ok = _judge_read_file(str(allowed), allowed_path=allowed)
+        loaded = json.loads(ok)
+        assert loaded["content"].startswith("hello")
+
+        # Try to read a different file.
+        sneaky = tmp_path / "secret.txt"
+        sneaky.write_text("nope\n")
+        denied = _judge_read_file(str(sneaky), allowed_path=allowed)
+        loaded = json.loads(denied)
+        assert "error" in loaded
+        assert "restricted" in loaded["error"]
+
+    def test_pagination(self, hermes_home, tmp_path):
+        from hermes_cli.goals import _judge_read_file
+        f = tmp_path / "big.json"
+        f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n")
+
+        # offset=10, limit=5 should return lines 10..14.
+        result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f))
+        assert result["returned"] == 5
+        assert "line-9" in result["content"]   # 1-based: line 10 == zero-indexed 9
+        assert result["next_offset"] == 15
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Index conversion: judge emits 1-based, apply layer uses 0-based
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestJudgeIndexConversion:
+    def test_parse_evaluate_converts_1based_to_0based(self):
+        """The judge sees the checklist with 1-based indices (rendered as
+        '1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices.
+        ``_parse_evaluate_response`` must convert them to 0-based so the
+        apply layer can index ``state.checklist`` directly.
+        """
+        from hermes_cli.goals import _parse_evaluate_response
+
+        raw = '''
+        {"updates": [
+            {"index": 1, "status": "completed", "evidence": "first item"},
+            {"index": 3, "status": "impossible", "evidence": "third item"}
+        ],
+         "new_items": [],
+         "reason": "evaluated"}
+        '''
+        parsed, parse_failed = _parse_evaluate_response(raw)
+        assert parse_failed is False
+        # 1 → 0, 3 → 2
+        assert [u["index"] for u in parsed["updates"]] == [0, 2]
+        assert parsed["updates"][0]["evidence"] == "first item"
+        assert parsed["updates"][1]["status"] == "impossible"
+
+    def test_full_round_trip_judge_index_to_state(self, hermes_home):
+        """End-to-end: judge emits 1-based, parser converts, apply layer
+        flips the right items in state.checklist."""
+        from hermes_cli import goals
+        from hermes_cli.goals import (
+            GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED,
+            ADDED_BY_JUDGE,
+        )
+
+        mgr = GoalManager(session_id="idx-round-trip")
+        mgr.set("g")
+        mgr.state.decomposed = True
+        mgr.state.checklist = [
+            ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
+            ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
+            ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
+        ]
+        goals.save_goal("idx-round-trip", mgr.state)
+
+        # Simulate the judge returning a raw-JSON Phase-B reply via the
+        # auxiliary client: the parser handles the 1-based → 0-based
+        # conversion so the apply layer flips item 1 (text="first").
+        class FakeMessage:
+            content = '''
+            {"updates": [{"index": 1, "status": "completed", "evidence": "first done"}],
+             "new_items": [],
+             "reason": "..."}
+            '''
+            tool_calls = None
+
+        class FakeChoice:
+            message = FakeMessage()
+
+        class FakeResponse:
+            choices = [FakeChoice()]
+
+        class FakeClient:
+            class chat:
+                class completions:
+                    @staticmethod
+                    def create(**kwargs):
+                        return FakeResponse()
+
+        with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")):
+            mgr.evaluate_after_turn("ran the script and item 1 is done")
+
+        # Item 1 (text="first") should now be completed.
+        assert mgr.state.checklist[0].text == "first"
+        assert mgr.state.checklist[0].status == ITEM_COMPLETED
+        assert mgr.state.checklist[0].evidence == "first done"
+        # Other items still pending.
+        assert mgr.state.checklist[1].status == ITEM_PENDING
+        assert mgr.state.checklist[2].status == ITEM_PENDING
diff --git a/tui_gateway/server.py b/tui_gateway/server.py
index 73b2fc1a811..07febffaf98 100644
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@@ -3251,6 +3251,7 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None:
                             decision = goal_mgr.evaluate_after_turn(
                                 raw,
                                 user_initiated=True,
+                                messages=list(session.get("history") or []),
                             )
                             verdict_msg = decision.get("message") or ""
                             if verdict_msg: