From 3e7145e0bbcded852a5324ceb549fe5ca94ac924 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 11 May 2026 07:06:27 -0700
Subject: [PATCH] revert: roll back /goal checklist + /subgoal feature stack
 (#23813)

* Revert "fix(goals): force judge to use tool calls instead of JSON-text replies (#23547)"

This reverts commit a63a2b7c78562cd4eaf33f5f7db81ae0b3938552.

* Revert "fix(goals): forward standing /goal state on auto-compression session rotation (#23530)"

This reverts commit 4a080b1d5aa7528a679880c93147bc7fffdd267a.

* Revert "feat(goals): /goal checklist + /subgoal user controls (#23456)"

This reverts commit 404640a2b752f502825dc8b26212204fa890d495.
---
 cli.py                                  |  105 +-
 gateway/run.py                          |  101 +-
 hermes_cli/commands.py                  |    2 -
 hermes_cli/goals.py                     | 1442 ++---------------------
 run_agent.py                            |   19 -
 tests/cli/test_cli_goal_interrupt.py    |   17 +-
 tests/gateway/test_goal_verdict_send.py |   14 +-
 tests/hermes_cli/test_goals.py          |  899 +-------------
 tui_gateway/server.py                   |    1 -
 9 files changed, 113 insertions(+), 2487 deletions(-)
diff --git a/cli.py b/cli.py
index fd9cc275e8a..bb1bdb61764 100644
--- a/cli.py
+++ b/cli.py
@@ -7252,8 +7252,6 @@ class HermesCLI:
                 _cprint(f"  No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}")
         elif canonical == "goal":
             self._handle_goal_command(cmd_original)
-        elif canonical == "subgoal":
-            self._handle_subgoal_command(cmd_original)
         elif canonical == "skin":
             self._handle_skin_command(cmd_original)
         elif canonical == "voice":
@@ -7850,103 +7848,6 @@ class HermesCLI:
         except Exception:
             pass
 
-    def _handle_subgoal_command(self, cmd: str) -> None:
-        """Dispatch /subgoal subcommands.
-
-        Forms:
-          /subgoal                              show the checklist
-          /subgoal <text>                       append a user item
-          /subgoal complete <n>                 mark item n completed
-          /subgoal impossible <n>               mark item n impossible
-          /subgoal undo <n>                     revert item n to pending
-          /subgoal remove <n>                   delete item n
-          /subgoal clear                        wipe the checklist (judge re-decomposes)
-        """
-        parts = (cmd or "").strip().split(None, 2)
-        # parts[0] == "/subgoal"; remainder is what the user typed
-        arg = " ".join(parts[1:]).strip() if len(parts) > 1 else ""
-
-        mgr = self._get_goal_manager()
-        if mgr is None:
-            _cprint(f"  {_DIM}Goals unavailable (no active session).{_RST}")
-            return
-
-        if not mgr.has_goal():
-            _cprint(f"  {_DIM}No active goal. Set one with /goal <text>.{_RST}")
-            return
-
-        # No args → show the checklist.
-        if not arg:
-            _cprint(f"  {mgr.status_line()}")
-            _cprint(f"  {mgr.render_checklist()}")
-            return
-
-        tokens = arg.split(None, 1)
-        verb = tokens[0].lower()
-        rest = tokens[1].strip() if len(tokens) > 1 else ""
-
-        # Action verbs operate on indices.
-        action_status_map = {
-            "complete": "completed",
-            "completed": "completed",
-            "done": "completed",
-            "impossible": "impossible",
-            "imp": "impossible",
-            "skip": "impossible",
-            "undo": "pending",
-            "pending": "pending",
-            "reset": "pending",
-        }
-        if verb in action_status_map:
-            if not rest:
-                _cprint(f"  Usage: /subgoal {verb} <n>")
-                return
-            try:
-                idx = int(rest.split()[0])
-            except ValueError:
-                _cprint(f"  /subgoal {verb}: <n> must be an integer (1-based index).")
-                return
-            try:
-                item = mgr.mark_subgoal(idx, action_status_map[verb])
-            except (IndexError, ValueError, RuntimeError) as exc:
-                _cprint(f"  /subgoal {verb}: {exc}")
-                return
-            _cprint(f"  ✓ Item {idx} → {item.status}: {item.text}")
-            return
-
-        if verb == "remove":
-            if not rest:
-                _cprint("  Usage: /subgoal remove <n>")
-                return
-            try:
-                idx = int(rest.split()[0])
-            except ValueError:
-                _cprint("  /subgoal remove: <n> must be an integer (1-based index).")
-                return
-            try:
-                removed = mgr.remove_subgoal(idx)
-            except (IndexError, RuntimeError) as exc:
-                _cprint(f"  /subgoal remove: {exc}")
-                return
-            _cprint(f"  ✓ Removed item {idx}: {removed.text}")
-            return
-
-        if verb == "clear":
-            mgr.clear_checklist()
-            _cprint(
-                "  ✓ Checklist cleared. The judge will re-decompose on the next turn."
-            )
-            return
-
-        # Otherwise: append `arg` as a user-authored checklist item.
-        try:
-            item = mgr.add_subgoal(arg)
-        except (ValueError, RuntimeError) as exc:
-            _cprint(f"  /subgoal: {exc}")
-            return
-        idx = len(mgr.state.checklist) if mgr.state else 0
-        _cprint(f"  ✓ Added subgoal {idx}: {item.text}")
-
     def _maybe_continue_goal_after_turn(self) -> None:
         """Hook run after every CLI turn. Judges + maybe re-queues.
 
@@ -8024,11 +7925,7 @@ class HermesCLI:
         if not last_response.strip():
             return
 
-        decision = mgr.evaluate_after_turn(
-            last_response,
-            user_initiated=True,
-            messages=getattr(self, "conversation_history", None) or [],
-        )
+        decision = mgr.evaluate_after_turn(last_response, user_initiated=True)
         msg = decision.get("message") or ""
         if msg:
             _cprint(f"  {msg}")
diff --git a/gateway/run.py b/gateway/run.py
index 7b2d7dea333..6c652e7cf10 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -6083,12 +6083,6 @@ class GatewayRunner:
                     return await self._handle_goal_command(event)
                 return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal."
 
-            # /subgoal is safe mid-run — it only modifies the active goal's
-            # checklist, which the judge consults at turn boundaries. There
-            # is no race with the running turn.
-            if _cmd_def_inner and _cmd_def_inner.name == "subgoal":
-                return await self._handle_subgoal_command(event)
-
             # Session-level toggles that are safe to run mid-agent —
             # /yolo can unblock a pending approval prompt, /verbose cycles
             # the tool-progress display mode for the ongoing stream.
@@ -6467,9 +6461,6 @@ class GatewayRunner:
         if canonical == "goal":
             return await self._handle_goal_command(event)
 
-        if canonical == "subgoal":
-            return await self._handle_subgoal_command(event)
-
         if canonical == "voice":
             return await self._handle_voice_command(event)
 
@@ -6654,18 +6645,10 @@ class GatewayRunner:
                     except Exception:
                         session_entry = None
                     if session_entry is not None:
-                        # Pull the agent's full messages list from the result
-                        # so the judge can dump it for its read_file tool.
-                        _agent_messages: list = []
-                        if isinstance(_agent_result, dict):
-                            _msgs = _agent_result.get("messages")
-                            if isinstance(_msgs, list):
-                                _agent_messages = _msgs
                         await self._post_turn_goal_continuation(
                             session_entry=session_entry,
                             source=source,
                             final_response=_final_text,
-                            agent_messages=_agent_messages,
                         )
             except Exception as _goal_exc:
                 logger.debug("goal continuation hook failed: %s", _goal_exc)
@@ -9402,83 +9385,6 @@ class GatewayRunner:
 
         return t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
 
-    async def _handle_subgoal_command(self, event: "MessageEvent") -> str:
-        """Handle /subgoal for gateway platforms.
-
-        Forms (mirror of CLI):
-          /subgoal                              show the checklist
-          /subgoal <text>                       append a user item
-          /subgoal complete <n> | done <n>      mark item n completed
-          /subgoal impossible <n>               mark item n impossible
-          /subgoal undo <n>                     revert item n to pending
-          /subgoal remove <n>                   delete item n
-          /subgoal clear                        wipe the checklist
-        """
-        args = (event.get_command_args() or "").strip()
-
-        mgr, _session_entry = self._get_goal_manager_for_event(event)
-        if mgr is None:
-            return t("gateway.goal.unavailable")
-
-        if not mgr.has_goal():
-            return "No active goal. Set one with /goal <text>."
-
-        if not args:
-            return f"{mgr.status_line()}\n{mgr.render_checklist()}"
-
-        tokens = args.split(None, 1)
-        verb = tokens[0].lower()
-        rest = tokens[1].strip() if len(tokens) > 1 else ""
-
-        action_status_map = {
-            "complete": "completed",
-            "completed": "completed",
-            "done": "completed",
-            "impossible": "impossible",
-            "imp": "impossible",
-            "skip": "impossible",
-            "undo": "pending",
-            "pending": "pending",
-            "reset": "pending",
-        }
-        if verb in action_status_map:
-            if not rest:
-                return f"Usage: /subgoal {verb} <n>"
-            try:
-                idx = int(rest.split()[0])
-            except ValueError:
-                return f"/subgoal {verb}: <n> must be an integer (1-based index)."
-            try:
-                item = mgr.mark_subgoal(idx, action_status_map[verb])
-            except (IndexError, ValueError, RuntimeError) as exc:
-                return f"/subgoal {verb}: {exc}"
-            return f"✓ Item {idx} → {item.status}: {item.text}"
-
-        if verb == "remove":
-            if not rest:
-                return "Usage: /subgoal remove <n>"
-            try:
-                idx = int(rest.split()[0])
-            except ValueError:
-                return "/subgoal remove: <n> must be an integer (1-based index)."
-            try:
-                removed = mgr.remove_subgoal(idx)
-            except (IndexError, RuntimeError) as exc:
-                return f"/subgoal remove: {exc}"
-            return f"✓ Removed item {idx}: {removed.text}"
-
-        if verb == "clear":
-            mgr.clear_checklist()
-            return "✓ Checklist cleared. The judge will re-decompose on the next turn."
-
-        # Otherwise — append `args` as a new user-authored checklist item.
-        try:
-            item = mgr.add_subgoal(args)
-        except (ValueError, RuntimeError) as exc:
-            return f"/subgoal: {exc}"
-        idx = len(mgr.state.checklist) if mgr.state else 0
-        return f"✓ Added subgoal {idx}: {item.text}"
-
     async def _send_goal_status_notice(self, source: Any, message: str) -> None:
         """Send a /goal judge status line back to the originating chat/thread."""
         adapter = self.adapters.get(source.platform)
@@ -9547,7 +9453,6 @@ class GatewayRunner:
         session_entry: Any,
         source: Any,
         final_response: str,
-        agent_messages: Optional[list] = None,
     ) -> None:
         """Run the goal judge after a gateway turn and, if still active,
         enqueue a continuation prompt for the same session.
@@ -9575,11 +9480,7 @@ class GatewayRunner:
         if not mgr.is_active():
             return
 
-        decision = mgr.evaluate_after_turn(
-            final_response or "",
-            user_initiated=True,
-            messages=agent_messages or [],
-        )
+        decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True)
         msg = decision.get("message") or ""
 
         # Defer the status line until after the adapter has delivered the
diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py
index e2727c25bab..3f543bba647 100644
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@@ -104,8 +104,6 @@ COMMAND_REGISTRY: list[CommandDef] = [
                args_hint="<prompt>"),
     CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session",
                args_hint="[text | pause | resume | clear | status]"),
-    CommandDef("subgoal", "Add or manage checklist items on the active goal", "Session",
-               args_hint="[text | complete N | impossible N | undo N | remove N | clear]"),
     CommandDef("status", "Show session info", "Session"),
     CommandDef("whoami", "Show your slash command access (admin / user)", "Info"),
     CommandDef("profile", "Show active profile name and home directory", "Info"),
diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py
index 3bd869296a8..894cdddb01b 100644
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@@ -7,18 +7,6 @@ continuation prompt back into the same session and keeps working until the
 goal is done, turn budget is exhausted, the user pauses/clears it, or the
 user sends a new message (which takes priority and pauses the goal loop).
 
-Checklist mode (added 2026-05): when a goal is set, a Phase-A "decompose"
-call asks the judge to write an extremely detailed checklist of concrete
-completion criteria for that goal. On every subsequent turn (Phase B) the
-judge evaluates the agent's most recent output against EACH pending item
-and may flip pending → completed | impossible, or append new items it
-discovers along the way. The goal is done only when every checklist item
-is in a terminal status. This is much harsher than the freeform
-"is the goal done?" prompt and gives users a visible, verifiable progress
-surface via /subgoal. A bounded read_file tool loop lets the judge inspect
-the dumped conversation history when the snippet alone isn't enough to
-rule.
-
 State is persisted in SessionDB's ``state_meta`` table keyed by
 ``goal:<session_id>`` so ``/resume`` picks it up.
 
@@ -33,9 +21,6 @@ Design notes / invariants:
   prompt and also pauses the goal loop for that turn (we still re-judge
   after, so if the user's message happens to complete the goal the judge
   will say ``done``).
-- Stickiness: once an item is marked completed or impossible, only the user
-  (via /subgoal undo) can flip it back. Judge updates that try to regress
-  terminal items are silently ignored.
 - This module has zero hard dependency on ``cli.HermesCLI`` or the gateway
   runner — both wire the same ``GoalManager`` in.
 
@@ -46,12 +31,10 @@ from __future__ import annotations
 
 import json
 import logging
-import os
 import re
 import time
-from dataclasses import dataclass, field, asdict
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from dataclasses import dataclass, asdict
+from typing import Any, Dict, Optional, Tuple
 
 logger = logging.getLogger(__name__)
 
@@ -61,9 +44,8 @@ logger = logging.getLogger(__name__)
 # ──────────────────────────────────────────────────────────────────────
 
 DEFAULT_MAX_TURNS = 20
-DEFAULT_JUDGE_TIMEOUT = 60.0
-# Cap how much of the last response we send to the judge inline. The judge
-# can read the dumped conversation file via read_file if it needs more.
+DEFAULT_JUDGE_TIMEOUT = 30.0
+# Cap how much of the last response + recent messages we send to the judge.
 _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # After this many consecutive judge *parse* failures (empty output / non-JSON),
 # the loop auto-pauses and points the user at the goal_judge config. API /
@@ -73,36 +55,8 @@ _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # exhausted with every reply shaped like `judge returned empty response` or
 # `judge reply was not JSON`.
 DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
-# Bound the Phase-B judge tool loop: if the judge keeps calling read_file
-# without ever emitting a verdict, cap it so we don't burn the model's budget.
-DEFAULT_MAX_JUDGE_TOOL_CALLS = 5
-# Cap a single read_file response so a judge that tries to read 100k lines
-# doesn't blow up its own context. Judge can paginate if needed.
-_JUDGE_READ_FILE_MAX_LINES = 400
-_JUDGE_READ_FILE_MAX_CHARS = 32_000
 
 
-# Status constants ────────────────────────────────────────────────────
-ITEM_PENDING = "pending"
-ITEM_COMPLETED = "completed"
-ITEM_IMPOSSIBLE = "impossible"
-TERMINAL_ITEM_STATUSES = frozenset({ITEM_COMPLETED, ITEM_IMPOSSIBLE})
-VALID_ITEM_STATUSES = frozenset({ITEM_PENDING, ITEM_COMPLETED, ITEM_IMPOSSIBLE})
-
-ITEM_MARKERS = {
-    ITEM_COMPLETED: "[x]",
-    ITEM_IMPOSSIBLE: "[!]",
-    ITEM_PENDING: "[ ]",
-}
-
-ADDED_BY_JUDGE = "judge"
-ADDED_BY_USER = "user"
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Continuation prompt
-# ──────────────────────────────────────────────────────────────────────
-
 CONTINUATION_PROMPT_TEMPLATE = (
     "[Continuing toward your standing goal]\n"
     "Goal: {goal}\n\n"
@@ -111,57 +65,8 @@ CONTINUATION_PROMPT_TEMPLATE = (
     "If you are blocked and need input from the user, say so clearly and stop."
 )
 
-CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE = (
-    "[Continuing toward your standing goal]\n"
-    "Goal: {goal}\n\n"
-    "Checklist progress ({done}/{total} done):\n"
-    "{checklist}\n\n"
-    "Work on the unchecked items above. Do not declare items done yourself "
-    "— a judge marks them based on evidence in your output. If an item is "
-    "genuinely impossible in this environment, explain why so the judge can "
-    "mark it impossible. If you are blocked on a remaining item and need "
-    "user input, say so clearly and stop."
-)
 
-
-# ──────────────────────────────────────────────────────────────────────
-# Phase-A: decompose prompts
-# ──────────────────────────────────────────────────────────────────────
-
-DECOMPOSE_SYSTEM_PROMPT = (
-    "You are a strict judge for an autonomous agent. Your first job, before "
-    "judging anything, is to break the user's stated goal into an EXTREMELY "
-    "DETAILED checklist of concrete, verifiable completion criteria. Each "
-    "item must be specific enough that a third party reading the agent's "
-    "output could decide unambiguously whether that item was achieved.\n\n"
-    "Be exhaustive. Bias toward MORE items, not fewer. Include sub-items, "
-    "edge cases, quality bars, deployment steps, verification checks, and "
-    "anything the user would reasonably expect from a goal of this type. "
-    "If the user said 'build me a website' you should be enumerating "
-    "homepage exists, navigation links work, content is non-placeholder, "
-    "mobile responsive, accessibility tags present, deployed somewhere "
-    "publicly accessible, domain/URL is functional, etc. Better to "
-    "over-specify and let a few items get marked impossible than to "
-    "under-specify and let the agent declare victory early.\n\n"
-    "Submit your checklist by calling the ``submit_checklist`` tool. Do "
-    "not reply with prose or JSON in your message body — call the tool. "
-    "The system will not see anything you write outside the tool call."
-)
-
-DECOMPOSE_USER_PROMPT_TEMPLATE = (
-    "Goal:\n{goal}\n\n"
-    "Produce the harshest, most detailed checklist of completion criteria "
-    "you can. Aim for at least 5 items; more is better when warranted. "
-    "Each item should be a single verifiable statement of fact about the "
-    "finished work."
-)
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Phase-B: evaluate prompts
-# ──────────────────────────────────────────────────────────────────────
-
-EVALUATE_SYSTEM_PROMPT_FREEFORM = (
+JUDGE_SYSTEM_PROMPT = (
     "You are a strict judge evaluating whether an autonomous agent has "
     "achieved a user's stated goal. You receive the goal text and the "
     "agent's most recent response. Your only job is to decide whether "
@@ -173,57 +78,11 @@ EVALUATE_SYSTEM_PROMPT_FREEFORM = (
     "user input (treat this as DONE with reason describing the block).\n\n"
     "Otherwise the goal is NOT done — CONTINUE.\n\n"
     "Reply ONLY with a single JSON object on one line:\n"
-    '{"done": <true|false>, "reason": "<one-sentence rationale>"}'
+    '{\"done\": <true|false>, \"reason\": \"<one-sentence rationale>\"}'
 )
 
-EVALUATE_SYSTEM_PROMPT_CHECKLIST = (
-    "You are a strict judge evaluating an autonomous agent's progress on "
-    "a user's goal that has a detailed checklist of completion criteria. "
-    "For EACH currently-pending checklist item, decide whether the "
-    "available evidence shows the item is satisfied.\n\n"
-    "Be strict but not absurd. Default to leaving items pending UNLESS "
-    "evidence is reasonably clear. Reasonable evidence includes:\n"
-    "- The agent's most recent response describing or showing the work\n"
-    "- Tool call results visible in the conversation history (file writes, "
-    "command output, web requests, etc.)\n"
-    "- A clear statement by the agent that the work was done, when "
-    "supported by tool output earlier in the conversation\n\n"
-    "Do NOT require the agent to re-prove items it has already established "
-    "in earlier turns. If a tool call earlier in the conversation already "
-    "wrote a file, you do not need fresh `ls` output every turn — once "
-    "established, it's done.\n\n"
-    "Flip pending → completed when the response or recent tool calls show "
-    "the item is satisfied. Flip pending → impossible only when the work "
-    "demonstrates the item cannot be achieved in this environment (NOT "
-    "merely that the agent didn't try). Vague intentions ('I will do X "
-    "next') do NOT count as completion.\n\n"
-    "STICKINESS: items already marked completed or impossible are frozen. "
-    "Do not include them in your updates. Only the user can revert them.\n\n"
-    "TOOLS:\n"
-    "- ``read_file(path, offset, limit)``: inspect the dumped conversation "
-    "history file whose path is given in the user message. Use this when "
-    "the snippet alone isn't enough to rule. Each call costs tokens, so "
-    "only read when needed.\n"
-    "- ``update_checklist(updates, new_items, reason)``: issue your "
-    "verdict. Call this exactly once per turn when you are ready to rule. "
-    "Calling it ENDS the evaluation.\n\n"
-    "You MUST call one of these tools every turn. Do not reply with "
-    "prose or JSON in your message body — the system will not see "
-    "anything written outside tool calls. When you cite evidence, "
-    "reference the agent's actual output specifically."
-)
 
-EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE = (
-    "Goal:\n{goal}\n\n"
-    "Current checklist (each item is numbered, 1-based — use these "
-    "exact 1-based numbers as the ``index`` field in your updates):\n{checklist_block}\n\n"
-    "Agent's most recent response (snippet):\n{response}\n\n"
-    "Conversation history file (call read_file on this path if you need "
-    "more context — pagination supported via offset/limit):\n{history_path}\n\n"
-    "Evaluate each pending item. Cite specific evidence."
-)
-
-EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = (
+JUDGE_USER_PROMPT_TEMPLATE = (
     "Goal:\n{goal}\n\n"
     "Agent's most recent response:\n{response}\n\n"
     "Is the goal satisfied?"
@@ -231,55 +90,16 @@ EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE = (
 
 
 # ──────────────────────────────────────────────────────────────────────
-# Dataclasses
+# Dataclass
 # ──────────────────────────────────────────────────────────────────────
 
 
-@dataclass
-class ChecklistItem:
-    """One concrete completion criterion attached to a goal."""
-
-    text: str
-    status: str = ITEM_PENDING            # pending | completed | impossible
-    added_by: str = ADDED_BY_JUDGE        # judge | user
-    added_at: float = 0.0
-    completed_at: Optional[float] = None
-    evidence: Optional[str] = None        # judge's rationale on flip
-
-    def to_dict(self) -> Dict[str, Any]:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "ChecklistItem":
-        text = str(data.get("text", "")).strip()
-        if not text:
-            text = "(empty item)"
-        status = str(data.get("status", ITEM_PENDING)).strip().lower()
-        if status not in VALID_ITEM_STATUSES:
-            status = ITEM_PENDING
-        added_by = str(data.get("added_by", ADDED_BY_JUDGE)).strip().lower()
-        if added_by not in (ADDED_BY_JUDGE, ADDED_BY_USER):
-            added_by = ADDED_BY_JUDGE
-        return cls(
-            text=text,
-            status=status,
-            added_by=added_by,
-            added_at=float(data.get("added_at", 0.0) or 0.0),
-            completed_at=(
-                float(data["completed_at"])
-                if data.get("completed_at") is not None
-                else None
-            ),
-            evidence=data.get("evidence"),
-        )
-
-
 @dataclass
 class GoalState:
     """Serializable goal state stored per session."""
 
     goal: str
-    status: str = "active"                    # active | paused | done | cleared
+    status: str = "active"          # active | paused | done | cleared
     turns_used: int = 0
     max_turns: int = DEFAULT_MAX_TURNS
     created_at: float = 0.0
@@ -288,28 +108,13 @@ class GoalState:
     last_reason: Optional[str] = None
     paused_reason: Optional[str] = None       # why we auto-paused (budget, etc.)
     consecutive_parse_failures: int = 0       # judge-output parse failures in a row
-    # Checklist mode (added 2026-05). Both fields default safely so old
-    # state_meta rows load unchanged.
-    checklist: List[ChecklistItem] = field(default_factory=list)
-    decomposed: bool = False                  # has Phase-A run for this goal?
 
     def to_json(self) -> str:
-        data = asdict(self)
-        # asdict already serializes ChecklistItem via dataclass recursion.
-        return json.dumps(data, ensure_ascii=False)
+        return json.dumps(asdict(self), ensure_ascii=False)
 
     @classmethod
     def from_json(cls, raw: str) -> "GoalState":
         data = json.loads(raw)
-        raw_checklist = data.get("checklist") or []
-        checklist: List[ChecklistItem] = []
-        if isinstance(raw_checklist, list):
-            for item in raw_checklist:
-                if isinstance(item, dict):
-                    try:
-                        checklist.append(ChecklistItem.from_dict(item))
-                    except Exception:
-                        continue
         return cls(
             goal=data.get("goal", ""),
             status=data.get("status", "active"),
@@ -321,39 +126,8 @@ class GoalState:
             last_reason=data.get("last_reason"),
             paused_reason=data.get("paused_reason"),
             consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
-            checklist=checklist,
-            decomposed=bool(data.get("decomposed", False)),
         )
 
-    # --- checklist helpers ------------------------------------------------
-
-    def checklist_counts(self) -> Tuple[int, int, int, int]:
-        """Return (total, completed, impossible, pending)."""
-        total = len(self.checklist)
-        completed = sum(1 for it in self.checklist if it.status == ITEM_COMPLETED)
-        impossible = sum(1 for it in self.checklist if it.status == ITEM_IMPOSSIBLE)
-        pending = total - completed - impossible
-        return total, completed, impossible, pending
-
-    def all_terminal(self) -> bool:
-        """True iff at least one item exists and every item is in a terminal status."""
-        if not self.checklist:
-            return False
-        return all(it.status in TERMINAL_ITEM_STATUSES for it in self.checklist)
-
-    def render_checklist(self, *, numbered: bool = False) -> str:
-        if not self.checklist:
-            return "(empty)"
-        lines = []
-        for i, item in enumerate(self.checklist, start=1):
-            marker = ITEM_MARKERS.get(item.status, "[?]")
-            prefix = f"{i}. {marker}" if numbered else f"  {marker}"
-            line = f"{prefix} {item.text}"
-            if item.status == ITEM_IMPOSSIBLE and item.evidence:
-                line += f" (impossible: {item.evidence})"
-            lines.append(line)
-        return "\n".join(lines)
-
 
 # ──────────────────────────────────────────────────────────────────────
 # Persistence (SessionDB state_meta)
@@ -441,63 +215,7 @@ def clear_goal(session_id: str) -> None:
 
 
 # ──────────────────────────────────────────────────────────────────────
-# Conversation-history dump (read by the judge tool loop)
-# ──────────────────────────────────────────────────────────────────────
-
-
-def _goals_dump_dir() -> Optional[Path]:
-    """Return ``<HERMES_HOME>/goals`` (created on first use), or None on error."""
-    try:
-        from hermes_constants import get_hermes_home
-
-        home = Path(get_hermes_home())
-    except Exception as exc:
-        logger.debug("goals dump dir: get_hermes_home failed: %s", exc)
-        return None
-    try:
-        path = home / "goals"
-        path.mkdir(parents=True, exist_ok=True)
-        return path
-    except Exception as exc:
-        logger.debug("goals dump dir: mkdir failed: %s", exc)
-        return None
-
-
-def _safe_session_filename(session_id: str) -> str:
-    """Make a session_id safe for use as a filename component."""
-    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", session_id or "unknown")
-    # Bound length to keep filesystem happy.
-    return cleaned[:128] or "unknown"
-
-
-def conversation_dump_path(session_id: str) -> Optional[Path]:
-    """Where the dumped messages JSON for ``session_id`` lives."""
-    base = _goals_dump_dir()
-    if base is None:
-        return None
-    return base / f"{_safe_session_filename(session_id)}.json"
-
-
-def dump_conversation(session_id: str, messages: List[Dict[str, Any]]) -> Optional[Path]:
-    """Write ``messages`` to the goals/ dump file. Returns the path on success."""
-    if not session_id or not messages:
-        return None
-    path = conversation_dump_path(session_id)
-    if path is None:
-        return None
-    try:
-        # Best-effort: messages may contain non-JSON-serializable objects from
-        # provider-specific adapter shims. Fall through with default=str.
-        with open(path, "w", encoding="utf-8") as fh:
-            json.dump(messages, fh, ensure_ascii=False, indent=2, default=str)
-        return path
-    except Exception as exc:
-        logger.debug("dump_conversation: write failed: %s", exc)
-        return None
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Judge: parsing helpers
+# Judge
 # ──────────────────────────────────────────────────────────────────────
 
 
@@ -509,34 +227,11 @@ def _truncate(text: str, limit: int) -> str:
     return text[:limit] + "… [truncated]"
 
 
-_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)
-
-
-def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]:
-    """Best-effort extraction of a single JSON object from a possibly-prosey reply."""
-    if not raw:
-        return None
-    text = raw.strip()
-    if text.startswith("```"):
-        text = text.strip("`")
-        nl = text.find("\n")
-        if nl != -1:
-            text = text[nl + 1:]
-    try:
-        data = json.loads(text)
-    except Exception:
-        match = _JSON_OBJECT_RE.search(text)
-        if not match:
-            return None
-        try:
-            data = json.loads(match.group(0))
-        except Exception:
-            return None
-    return data if isinstance(data, dict) else None
+_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
 
 
 def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
-    """Parse the freeform judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
+    """Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
 
     Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
     judge returned output that couldn't be interpreted as the expected JSON
@@ -547,8 +242,30 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
     if not raw:
         return False, "judge returned empty response", True
 
-    data = _extract_json_object(raw)
-    if data is None:
+    text = raw.strip()
+
+    # Strip markdown code fences the model may wrap JSON in.
+    if text.startswith("```"):
+        text = text.strip("`")
+        # Peel off leading json/JSON/etc tag
+        nl = text.find("\n")
+        if nl != -1:
+            text = text[nl + 1:]
+
+    # First try: parse the whole blob.
+    data: Optional[Dict[str, Any]] = None
+    try:
+        data = json.loads(text)
+    except Exception:
+        # Second try: pull the first JSON object out.
+        match = _JSON_OBJECT_RE.search(text)
+        if match:
+            try:
+                data = json.loads(match.group(0))
+            except Exception:
+                data = None
+
+    if not isinstance(data, dict):
         return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
 
     done_val = data.get("done")
@@ -562,552 +279,49 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
     return done, reason, False
 
 
-def _parse_decompose_response(raw: str) -> Tuple[List[Dict[str, Any]], bool]:
-    """Parse a Phase-A decompose reply. Returns (items, parse_failed)."""
-    if not raw:
-        return [], True
-    data = _extract_json_object(raw)
-    if data is None:
-        return [], True
-    raw_items = data.get("checklist")
-    if not isinstance(raw_items, list):
-        return [], True
-    out: List[Dict[str, Any]] = []
-    for item in raw_items:
-        if isinstance(item, dict):
-            text = str(item.get("text", "")).strip()
-            if text:
-                out.append({"text": text})
-        elif isinstance(item, str):
-            text = item.strip()
-            if text:
-                out.append({"text": text})
-    return out, False
-
-
-def _parse_evaluate_response(raw: str) -> Tuple[Dict[str, Any], bool]:
-    """Parse a Phase-B checklist eval reply. Returns (parsed, parse_failed).
-
-    parsed = {"updates": [...], "new_items": [...], "reason": str}
-    """
-    if not raw:
-        return {"updates": [], "new_items": [], "reason": "judge returned empty response"}, True
-    data = _extract_json_object(raw)
-    if data is None:
-        return (
-            {
-                "updates": [],
-                "new_items": [],
-                "reason": f"judge reply was not JSON: {_truncate(raw, 200)!r}",
-            },
-            True,
-        )
-    updates = data.get("updates") or []
-    new_items = data.get("new_items") or []
-    reason = str(data.get("reason") or "").strip() or "no reason provided"
-    norm_updates = []
-    if isinstance(updates, list):
-        for upd in updates:
-            if not isinstance(upd, dict):
-                continue
-            try:
-                # Judge sees the checklist rendered with 1-based indices
-                # (matches the /subgoal CLI). Convert to 0-based here so the
-                # apply layer can index ``state.checklist`` directly.
-                idx_1based = int(upd.get("index"))
-            except (TypeError, ValueError):
-                continue
-            idx = idx_1based - 1
-            status = str(upd.get("status", "")).strip().lower()
-            if status not in TERMINAL_ITEM_STATUSES:
-                # Phase-B only accepts terminal flips. Pending → pending is a no-op.
-                continue
-            evidence = str(upd.get("evidence") or "").strip() or None
-            norm_updates.append({"index": idx, "status": status, "evidence": evidence})
-    norm_new = []
-    if isinstance(new_items, list):
-        for it in new_items:
-            if isinstance(it, dict):
-                text = str(it.get("text", "")).strip()
-                if text:
-                    norm_new.append({"text": text})
-            elif isinstance(it, str):
-                text = it.strip()
-                if text:
-                    norm_new.append({"text": text})
-    return {"updates": norm_updates, "new_items": norm_new, "reason": reason}, False
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Judge: read_file tool for the judge's bounded tool loop
-# ──────────────────────────────────────────────────────────────────────
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Judge tool schemas: read_file (history inspection) +
-# submit_checklist (Phase A) + update_checklist (Phase B)
-#
-# Forcing the judge to emit through tool calls is dramatically more
-# reliable than asking it to reply with JSON text. Most providers
-# enforce the schema server-side, so weak/small judge models can no
-# longer drift into prose, markdown fences, or empty bodies.
-# ──────────────────────────────────────────────────────────────────────
-
-
-_JUDGE_READ_FILE_TOOL_SCHEMA: Dict[str, Any] = {
-    "type": "function",
-    "function": {
-        "name": "read_file",
-        "description": (
-            "Read a portion of the dumped conversation history JSON file. "
-            "Use this when the snippet alone isn't enough to rule. Returns "
-            "lines from the file with 1-based line numbers. Pagination "
-            "supported via offset and limit. Reads beyond a built-in cap "
-            "are truncated."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "path": {
-                    "type": "string",
-                    "description": (
-                        "Absolute path to the conversation history file. "
-                        "You were given this in the user message."
-                    ),
-                },
-                "offset": {
-                    "type": "integer",
-                    "description": "1-indexed starting line number (default 1).",
-                    "default": 1,
-                },
-                "limit": {
-                    "type": "integer",
-                    "description": (
-                        f"Max lines to return (default {_JUDGE_READ_FILE_MAX_LINES})."
-                    ),
-                    "default": _JUDGE_READ_FILE_MAX_LINES,
-                },
-            },
-            "required": ["path"],
-        },
-    },
-}
-
-
-_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
-    "type": "function",
-    "function": {
-        "name": "submit_checklist",
-        "description": (
-            "Submit the harsh, detailed completion-criteria checklist you "
-            "decomposed the goal into. Each item is one verifiable "
-            "completion criterion. Bias toward more items, not fewer."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "items": {
-                    "type": "array",
-                    "description": (
-                        "List of checklist items. Each item is a single "
-                        "verifiable statement of fact about the finished "
-                        "work. Aim for at least 5 items; more is better "
-                        "when warranted."
-                    ),
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "text": {
-                                "type": "string",
-                                "description": "The completion-criterion text.",
-                            },
-                        },
-                        "required": ["text"],
-                    },
-                },
-            },
-            "required": ["items"],
-        },
-    },
-}
-
-
-_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA: Dict[str, Any] = {
-    "type": "function",
-    "function": {
-        "name": "update_checklist",
-        "description": (
-            "Issue your verdict on the current checklist. For each "
-            "currently-pending item, decide whether the agent's most "
-            "recent response (and conversation history if you read it) "
-            "shows the item is satisfied. You may also append new items "
-            "the original decomposition missed. Call this exactly once "
-            "when you are ready to rule — calling it ends the evaluation."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "updates": {
-                    "type": "array",
-                    "description": (
-                        "Per-item rulings. Use the 1-based ``index`` shown "
-                        "in the checklist. ``status`` must be 'completed' "
-                        "(clear evidence the item is done) or 'impossible' "
-                        "(item cannot be achieved in this environment). "
-                        "Items already in a terminal status are frozen — "
-                        "do not include them."
-                    ),
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "index": {
-                                "type": "integer",
-                                "description": "1-based checklist index.",
-                            },
-                            "status": {
-                                "type": "string",
-                                "enum": ["completed", "impossible"],
-                            },
-                            "evidence": {
-                                "type": "string",
-                                "description": (
-                                    "One-sentence specific citation of why "
-                                    "this item is done or impossible. "
-                                    "Reference the agent's actual output."
-                                ),
-                            },
-                        },
-                        "required": ["index", "status", "evidence"],
-                    },
-                },
-                "new_items": {
-                    "type": "array",
-                    "description": (
-                        "Optional: completion criteria the original "
-                        "decomposition missed. Stay strict — only add "
-                        "items that genuinely belong as completion "
-                        "criteria for this goal."
-                    ),
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "text": {
-                                "type": "string",
-                                "description": "The new criterion text.",
-                            },
-                        },
-                        "required": ["text"],
-                    },
-                },
-                "reason": {
-                    "type": "string",
-                    "description": "One-sentence overall rationale for this round of updates.",
-                },
-            },
-            "required": ["updates", "new_items", "reason"],
-        },
-    },
-}
-
-
-def _judge_read_file(
-    path: str,
-    *,
-    offset: int = 1,
-    limit: int = _JUDGE_READ_FILE_MAX_LINES,
-    allowed_path: Optional[Path] = None,
-) -> str:
-    """Bounded read of the dumped conversation file. Returns JSON-serializable text.
-
-    Restricted to ``allowed_path`` when provided — the judge cannot use this
-    tool to read arbitrary files.
-    """
-    if not path:
-        return json.dumps({"error": "path is required"})
-    try:
-        target = Path(path).resolve()
-    except Exception as exc:
-        return json.dumps({"error": f"path resolve failed: {exc}"})
-
-    if allowed_path is not None:
-        try:
-            allowed = allowed_path.resolve()
-        except Exception:
-            allowed = allowed_path
-        if target != allowed:
-            return json.dumps({
-                "error": (
-                    f"read_file is restricted to the conversation dump path. "
-                    f"Allowed: {allowed}"
-                )
-            })
-
-    if not target.exists():
-        return json.dumps({"error": f"file not found: {target}"})
-    try:
-        offset = max(1, int(offset or 1))
-        limit = max(1, min(int(limit or _JUDGE_READ_FILE_MAX_LINES), _JUDGE_READ_FILE_MAX_LINES))
-    except (TypeError, ValueError):
-        return json.dumps({"error": "offset and limit must be integers"})
-
-    try:
-        with open(target, "r", encoding="utf-8", errors="replace") as fh:
-            lines = fh.readlines()
-    except Exception as exc:
-        return json.dumps({"error": f"read failed: {exc}"})
-
-    total = len(lines)
-    start = offset - 1
-    end = min(start + limit, total)
-    slice_lines = lines[start:end]
-    out = "".join(slice_lines)
-    if len(out) > _JUDGE_READ_FILE_MAX_CHARS:
-        out = out[:_JUDGE_READ_FILE_MAX_CHARS] + "\n… [truncated by judge read cap]"
-    return json.dumps({
-        "path": str(target),
-        "total_lines": total,
-        "offset": offset,
-        "returned": len(slice_lines),
-        "next_offset": end + 1 if end < total else None,
-        "content": out,
-    }, ensure_ascii=False)
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Judge: phase-A (decompose) and phase-B (evaluate)
-# ──────────────────────────────────────────────────────────────────────
-
-
-def _get_judge_client() -> Tuple[Optional[Any], str]:
-    """Return (client, model) or (None, '') when unavailable."""
-    try:
-        from agent.auxiliary_client import get_text_auxiliary_client
-    except Exception as exc:
-        logger.debug("goal judge: auxiliary client import failed: %s", exc)
-        return None, ""
-    try:
-        client, model = get_text_auxiliary_client("goal_judge")
-    except Exception as exc:
-        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
-        return None, ""
-    if client is None or not model:
-        return None, ""
-    return client, model
-
-
-def _extract_tool_call(msg: Any, tool_name: str) -> Optional[Dict[str, Any]]:
-    """Find a tool call by name on a chat-completions message. Returns
-    ``{"id", "name", "arguments": <dict>}`` or None.
-
-    Robust to provider shims that return tool_calls as objects or dicts
-    and arguments as JSON strings or already-parsed dicts.
-    """
-    tool_calls = getattr(msg, "tool_calls", None) or []
-    for tc in tool_calls:
-        try:
-            tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?"
-            fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None)
-            if fn is None:
-                continue
-            fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "")
-            if fn_name != tool_name:
-                continue
-            fn_args_raw = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "")
-            if isinstance(fn_args_raw, str):
-                try:
-                    args = json.loads(fn_args_raw) if fn_args_raw else {}
-                except Exception:
-                    args = {}
-            elif isinstance(fn_args_raw, dict):
-                args = fn_args_raw
-            else:
-                args = {}
-            return {"id": tc_id, "name": fn_name, "arguments": args}
-        except Exception:
-            continue
-    return None
-
-
-def _serialize_assistant_tool_calls(msg: Any) -> List[Dict[str, Any]]:
-    """Convert a provider-shim tool_calls list into plain-dict form for
-    inclusion in subsequent ``messages=[...]`` payloads."""
-    out: List[Dict[str, Any]] = []
-    for tc in getattr(msg, "tool_calls", None) or []:
-        try:
-            tc_id = getattr(tc, "id", None) or (tc.get("id") if isinstance(tc, dict) else None) or "tc-?"
-            fn = getattr(tc, "function", None) or (tc.get("function") if isinstance(tc, dict) else None)
-            fn_name = getattr(fn, "name", None) or (fn.get("name") if isinstance(fn, dict) else "")
-            fn_args = getattr(fn, "arguments", None) or (fn.get("arguments") if isinstance(fn, dict) else "")
-            if not isinstance(fn_args, str):
-                try:
-                    fn_args = json.dumps(fn_args)
-                except Exception:
-                    fn_args = "{}"
-            out.append({
-                "id": tc_id,
-                "type": "function",
-                "function": {"name": fn_name or "", "arguments": fn_args},
-            })
-        except Exception:
-            continue
-    return out
-
-
-def _call_judge_with_tool_choice(
-    client: Any,
-    *,
-    model: str,
-    messages: List[Dict[str, Any]],
-    tools: List[Dict[str, Any]],
-    forced_tool_name: Optional[str],
-    timeout: float,
-    max_tokens: int = 1500,
-) -> Tuple[Optional[Any], Optional[str]]:
-    """Call the judge with a forced tool choice, falling back to ``auto``
-    if the provider rejects ``required`` / a specific function choice.
-
-    Returns ``(response, error)``. On success, ``error`` is None.
-    """
-    # First attempt: force the specific tool. Most modern providers
-    # support {"type": "function", "function": {"name": "..."}}.
-    primary_choice: Any
-    if forced_tool_name:
-        primary_choice = {"type": "function", "function": {"name": forced_tool_name}}
-    else:
-        primary_choice = "required"
-
-    attempts: List[Any] = [primary_choice, "required", "auto"]
-    last_err: Optional[str] = None
-    for choice in attempts:
-        try:
-            return client.chat.completions.create(
-                model=model,
-                messages=messages,
-                tools=tools,
-                tool_choice=choice,
-                temperature=0,
-                max_tokens=max_tokens,
-                timeout=timeout,
-            ), None
-        except Exception as exc:
-            last_err = f"{type(exc).__name__}: {exc}"
-            # Only retry on errors that look like the provider rejecting the
-            # tool_choice shape. Network errors etc. should bail immediately.
-            msg = str(exc).lower()
-            if not any(token in msg for token in (
-                "tool_choice", "tool choice", "required", "function call",
-                "unsupported", "not supported", "invalid", "400",
-            )):
-                return None, last_err
-            logger.debug("goal judge: tool_choice=%r rejected (%s); falling back", choice, exc)
-            continue
-    return None, last_err or "all tool_choice fallbacks failed"
-
-
-def decompose_goal(
-    goal: str,
-    *,
-    timeout: float = DEFAULT_JUDGE_TIMEOUT,
-) -> Tuple[List[Dict[str, Any]], Optional[str]]:
-    """Phase-A: ask the judge to break the goal into a checklist via a
-    forced ``submit_checklist`` tool call.
-
-    Returns ``(items, error)``. On any failure, returns ``([], reason)``
-    so the caller can fall back to freeform mode.
-    """
-    if not goal.strip():
-        return [], "empty goal"
-
-    client, model = _get_judge_client()
-    if client is None:
-        return [], "auxiliary client unavailable"
-
-    messages = [
-        {"role": "system", "content": DECOMPOSE_SYSTEM_PROMPT},
-        {
-            "role": "user",
-            "content": DECOMPOSE_USER_PROMPT_TEMPLATE.format(
-                goal=_truncate(goal, 4000)
-            ),
-        },
-    ]
-
-    resp, err = _call_judge_with_tool_choice(
-        client,
-        model=model,
-        messages=messages,
-        tools=[_JUDGE_SUBMIT_CHECKLIST_TOOL_SCHEMA],
-        forced_tool_name="submit_checklist",
-        timeout=timeout,
-        max_tokens=2000,
-    )
-    if resp is None:
-        logger.info("goal decompose: API call failed (%s)", err)
-        return [], f"decompose error: {err}"
-
-    try:
-        msg = resp.choices[0].message
-    except Exception:
-        return [], "decompose response malformed"
-
-    tc = _extract_tool_call(msg, "submit_checklist")
-    if tc is None:
-        # Provider responded but didn't call the tool. Try parsing content
-        # as a last-ditch backstop so a fully-broken provider doesn't
-        # silently leave the user with no checklist at all.
-        content = getattr(msg, "content", "") or ""
-        items, parse_failed = _parse_decompose_response(content)
-        if parse_failed or not items:
-            logger.info(
-                "goal decompose: no submit_checklist tool call AND no parseable JSON (raw=%r)",
-                _truncate(content, 200),
-            )
-            return [], "decompose: judge did not call submit_checklist"
-        logger.info("goal decompose: fell back to JSON-content parser (%d items)", len(items))
-        return items, None
-
-    raw_items = tc["arguments"].get("items") or []
-    items: List[Dict[str, Any]] = []
-    if isinstance(raw_items, list):
-        for entry in raw_items:
-            if isinstance(entry, dict):
-                text = str(entry.get("text", "")).strip()
-                if text:
-                    items.append({"text": text})
-            elif isinstance(entry, str):
-                text = entry.strip()
-                if text:
-                    items.append({"text": text})
-
-    if not items:
-        logger.info("goal decompose: submit_checklist returned empty items list")
-        return [], "decompose: empty checklist"
-
-    logger.info("goal decompose: produced %d checklist items via tool call", len(items))
-    return items, None
-
-
-def judge_goal_freeform(
+def judge_goal(
     goal: str,
     last_response: str,
     *,
     timeout: float = DEFAULT_JUDGE_TIMEOUT,
 ) -> Tuple[str, str, bool]:
-    """Legacy freeform judge — kept for goals with no checklist.
+    """Ask the auxiliary model whether the goal is satisfied.
 
     Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
-    ``"continue"``, or ``"skipped"``.
+    ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
+
+    ``parse_failed`` is True only when the judge call succeeded but its output
+    was unusable (empty or non-JSON). API/transport errors return False — they
+    are transient and should fail-open silently. Callers use this flag to
+    auto-pause after N consecutive parse failures (see
+    ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
+
+    This is deliberately fail-open: any error returns ``("continue", "...", False)``
+    so a broken judge doesn't wedge progress — the turn budget and the
+    consecutive-parse-failures auto-pause are the backstops.
     """
     if not goal.strip():
         return "skipped", "empty goal", False
     if not last_response.strip():
+        # No substantive reply this turn — almost certainly not done yet.
         return "continue", "empty response (nothing to evaluate)", False
 
-    client, model = _get_judge_client()
-    if client is None:
+    try:
+        from agent.auxiliary_client import get_text_auxiliary_client
+    except Exception as exc:
+        logger.debug("goal judge: auxiliary client import failed: %s", exc)
         return "continue", "auxiliary client unavailable", False
 
-    prompt = EVALUATE_USER_PROMPT_FREEFORM_TEMPLATE.format(
+    try:
+        client, model = get_text_auxiliary_client("goal_judge")
+    except Exception as exc:
+        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
+        return "continue", "auxiliary client unavailable", False
+
+    if client is None or not model:
+        return "continue", "no auxiliary client configured", False
+
+    prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
         goal=_truncate(goal, 2000),
         response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS),
     )
@@ -1116,7 +330,7 @@ def judge_goal_freeform(
         resp = client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_FREEFORM},
+                {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
             temperature=0,
@@ -1134,218 +348,10 @@ def judge_goal_freeform(
 
     done, reason, parse_failed = _parse_judge_response(raw)
     verdict = "done" if done else "continue"
-    logger.info("goal judge (freeform): verdict=%s reason=%s", verdict, _truncate(reason, 120))
+    logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
     return verdict, reason, parse_failed
 
 
-def evaluate_checklist(
-    state: GoalState,
-    last_response: str,
-    *,
-    history_path: Optional[Path],
-    timeout: float = DEFAULT_JUDGE_TIMEOUT,
-    max_tool_calls: int = DEFAULT_MAX_JUDGE_TOOL_CALLS,
-) -> Tuple[Dict[str, Any], bool]:
-    """Phase-B: judge evaluates each pending checklist item via forced
-    tool calls.
-
-    The judge has two tools available:
-      - ``read_file``: inspect the dumped conversation history
-      - ``update_checklist``: issue the verdict (terminates the loop)
-
-    ``tool_choice="required"`` forces one of them every iteration. We loop
-    until ``update_checklist`` is called or ``max_tool_calls`` is exhausted.
-
-    Returns ``(parsed, parse_failed)`` where parsed is
-    ``{"updates": [...], "new_items": [...], "reason": str}``.
-    Falls open on transport errors: empty updates/new_items, parse_failed=False.
-    """
-    client, model = _get_judge_client()
-    if client is None:
-        return ({"updates": [], "new_items": [], "reason": "auxiliary client unavailable"}, False)
-
-    # Render checklist with 1-based indices the judge addresses via the
-    # update_checklist tool's ``index`` field.
-    checklist_block = state.render_checklist(numbered=True)
-
-    user_prompt = EVALUATE_USER_PROMPT_CHECKLIST_TEMPLATE.format(
-        goal=_truncate(state.goal, 2000),
-        checklist_block=checklist_block,
-        response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS),
-        history_path=str(history_path) if history_path else "(unavailable — judge from snippet only)",
-    )
-
-    messages: List[Dict[str, Any]] = [
-        {"role": "system", "content": EVALUATE_SYSTEM_PROMPT_CHECKLIST},
-        {"role": "user", "content": user_prompt},
-    ]
-
-    # Build the toolbox: read_file is only useful when we actually have a
-    # history file to read, so we omit it otherwise to keep the schema lean.
-    tools: List[Dict[str, Any]] = [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
-    if history_path is not None:
-        tools.insert(0, _JUDGE_READ_FILE_TOOL_SCHEMA)
-
-    reads_left = max(0, int(max_tool_calls)) if history_path is not None else 0
-
-    # Bound the overall loop generously — the judge will normally finish in
-    # one or two passes (read_file once, then update_checklist; or just
-    # update_checklist directly).
-    for iteration in range(reads_left + 2):
-        # When out of read budget, drop read_file from the toolbox so the
-        # judge MUST emit update_checklist.
-        loop_tools = tools if reads_left > 0 else [_JUDGE_UPDATE_CHECKLIST_TOOL_SCHEMA]
-        # Forcing update_checklist directly when reads are exhausted gives
-        # us the strongest guarantee of termination.
-        forced = "update_checklist" if reads_left <= 0 else None
-
-        resp, err = _call_judge_with_tool_choice(
-            client,
-            model=model,
-            messages=messages,
-            tools=loop_tools,
-            forced_tool_name=forced,
-            timeout=timeout,
-            max_tokens=1500,
-        )
-        if resp is None:
-            logger.info("goal judge (checklist): API call failed (%s)", err)
-            return (
-                {
-                    "updates": [],
-                    "new_items": [],
-                    "reason": f"judge error: {err}",
-                },
-                False,
-            )
-
-        try:
-            msg = resp.choices[0].message
-        except Exception:
-            return (
-                {"updates": [], "new_items": [], "reason": "judge response malformed"},
-                True,
-            )
-
-        # Did the judge call update_checklist? If yes, we're done.
-        update_tc = _extract_tool_call(msg, "update_checklist")
-        if update_tc is not None:
-            parsed = _normalize_update_args(update_tc["arguments"])
-            logger.info(
-                "goal judge (checklist): updates=%d new_items=%d reason=%s",
-                len(parsed.get("updates") or []),
-                len(parsed.get("new_items") or []),
-                _truncate(parsed.get("reason", ""), 120),
-            )
-            return parsed, False
-
-        # Did the judge call read_file? If yes, run it and feed the result back.
-        read_tc = _extract_tool_call(msg, "read_file")
-        if read_tc is not None and reads_left > 0:
-            args = read_tc["arguments"]
-            tool_result = _judge_read_file(
-                str(args.get("path", "")),
-                offset=args.get("offset", 1),
-                limit=args.get("limit", _JUDGE_READ_FILE_MAX_LINES),
-                allowed_path=history_path,
-            )
-            messages.append({
-                "role": "assistant",
-                "content": getattr(msg, "content", "") or "",
-                "tool_calls": _serialize_assistant_tool_calls(msg),
-            })
-            messages.append({
-                "role": "tool",
-                "tool_call_id": read_tc["id"],
-                "name": "read_file",
-                "content": tool_result,
-            })
-            reads_left -= 1
-            continue
-
-        # Neither tool was called. Try parsing the content body as a last-
-        # ditch backstop, then bail.
-        content = getattr(msg, "content", "") or ""
-        if content.strip():
-            parsed, parse_failed = _parse_evaluate_response(content)
-            if not parse_failed:
-                logger.info(
-                    "goal judge (checklist): fell back to JSON-content parser "
-                    "updates=%d new_items=%d",
-                    len(parsed.get("updates") or []),
-                    len(parsed.get("new_items") or []),
-                )
-                return parsed, False
-        logger.info(
-            "goal judge (checklist): judge emitted neither read_file nor "
-            "update_checklist (iteration=%d, content=%r) — bailing",
-            iteration, _truncate(content, 120),
-        )
-        return (
-            {
-                "updates": [],
-                "new_items": [],
-                "reason": "judge did not call update_checklist",
-            },
-            True,
-        )
-
-    # Loop exhausted without an update_checklist call.
-    return (
-        {
-            "updates": [],
-            "new_items": [],
-            "reason": "judge tool-loop exhausted without verdict",
-        },
-        True,
-    )
-
-
-def _normalize_update_args(args: Dict[str, Any]) -> Dict[str, Any]:
-    """Validate and normalize the ``update_checklist`` tool arguments.
-
-    Performs the same 1-based → 0-based conversion and terminal-status
-    filter as ``_parse_evaluate_response``. Returns the canonical
-    ``{updates, new_items, reason}`` shape callers expect.
-    """
-    raw_updates = args.get("updates") or []
-    raw_new = args.get("new_items") or []
-    reason = str(args.get("reason") or "").strip() or "no reason provided"
-
-    norm_updates: List[Dict[str, Any]] = []
-    if isinstance(raw_updates, list):
-        for upd in raw_updates:
-            if not isinstance(upd, dict):
-                continue
-            try:
-                idx_1based = int(upd.get("index"))
-            except (TypeError, ValueError):
-                continue
-            status = str(upd.get("status", "")).strip().lower()
-            if status not in TERMINAL_ITEM_STATUSES:
-                continue
-            evidence = str(upd.get("evidence") or "").strip() or None
-            norm_updates.append({
-                "index": idx_1based - 1,  # 1-based → 0-based for apply layer
-                "status": status,
-                "evidence": evidence,
-            })
-
-    norm_new: List[Dict[str, Any]] = []
-    if isinstance(raw_new, list):
-        for it in raw_new:
-            if isinstance(it, dict):
-                text = str(it.get("text", "")).strip()
-                if text:
-                    norm_new.append({"text": text})
-            elif isinstance(it, str):
-                text = it.strip()
-                if text:
-                    norm_new.append({"text": text})
-
-    return {"updates": norm_updates, "new_items": norm_new, "reason": reason}
-
-
 # ──────────────────────────────────────────────────────────────────────
 # GoalManager — the orchestration surface CLI + gateway talk to
 # ──────────────────────────────────────────────────────────────────────
@@ -1362,12 +368,8 @@ class GoalManager:
     - ``clear()`` — remove the active goal.
     - ``pause()`` / ``resume()`` — explicit user controls.
     - ``status()`` — printable one-liner.
-    - ``add_subgoal(text)`` — user appends a checklist item.
-    - ``mark_subgoal(index, status)`` — user flips an item (override).
-    - ``remove_subgoal(index)`` — user deletes an item.
-    - ``clear_checklist()`` — user wipes the checklist; next turn re-decomposes.
-    - ``evaluate_after_turn(last_response, agent=None)`` — call the judge,
-      update state, return a decision dict.
+    - ``evaluate_after_turn(last_response)`` — call the judge, update state,
+      and return a decision dict the caller uses to drive the next turn.
     - ``next_continuation_prompt()`` — the canonical user-role message to
       feed back into ``run_conversation``.
     """
@@ -1394,26 +396,14 @@ class GoalManager:
         if s is None or s.status in ("cleared",):
             return "No active goal. Set one with /goal <text>."
         turns = f"{s.turns_used}/{s.max_turns} turns"
-        cl_total, cl_done, cl_imp, _ = s.checklist_counts()
-        cl_text = ""
-        if cl_total:
-            cl_text = f", {cl_done + cl_imp}/{cl_total} done"
         if s.status == "active":
-            return f"⊙ Goal (active, {turns}{cl_text}): {s.goal}"
+            return f"⊙ Goal (active, {turns}): {s.goal}"
         if s.status == "paused":
             extra = f" — {s.paused_reason}" if s.paused_reason else ""
-            return f"⏸ Goal (paused, {turns}{cl_text}{extra}): {s.goal}"
+            return f"⏸ Goal (paused, {turns}{extra}): {s.goal}"
         if s.status == "done":
-            return f"✓ Goal done ({turns}{cl_text}): {s.goal}"
-        return f"Goal ({s.status}, {turns}{cl_text}): {s.goal}"
-
-    def render_checklist(self) -> str:
-        """Public helper for the /subgoal slash command."""
-        if self._state is None:
-            return "(no active goal)"
-        if not self._state.checklist:
-            return "(checklist empty — judge will populate it on the next turn)"
-        return self._state.render_checklist(numbered=True)
+            return f"✓ Goal done ({turns}): {s.goal}"
+        return f"Goal ({s.status}, {turns}): {s.goal}"
 
     # --- mutation -----------------------------------------------------
 
@@ -1428,8 +418,6 @@ class GoalManager:
             max_turns=int(max_turns) if max_turns else self.default_max_turns,
             created_at=time.time(),
             last_turn_at=0.0,
-            checklist=[],
-            decomposed=False,
         )
         self._state = state
         save_goal(self.session_id, state)
@@ -1468,77 +456,6 @@ class GoalManager:
         self._state.last_reason = reason
         save_goal(self.session_id, self._state)
 
-    # --- /subgoal user controls ---------------------------------------
-
-    def add_subgoal(self, text: str) -> ChecklistItem:
-        """User appends a new checklist item. Requires an active or paused goal."""
-        if self._state is None:
-            raise RuntimeError("no active goal")
-        text = (text or "").strip()
-        if not text:
-            raise ValueError("subgoal text is empty")
-        item = ChecklistItem(
-            text=text,
-            status=ITEM_PENDING,
-            added_by=ADDED_BY_USER,
-            added_at=time.time(),
-        )
-        self._state.checklist.append(item)
-        save_goal(self.session_id, self._state)
-        return item
-
-    def mark_subgoal(self, index_1based: int, status: str) -> ChecklistItem:
-        """User overrides an item's status.
-
-        ``status`` may be ``completed``, ``impossible``, or ``pending``
-        (the last only as an undo flow). Stickiness rules do NOT apply to
-        user actions — the user is the only authority that can revert
-        terminal items.
-        """
-        if self._state is None:
-            raise RuntimeError("no active goal")
-        status = (status or "").strip().lower()
-        if status not in VALID_ITEM_STATUSES:
-            raise ValueError(
-                f"status must be one of {sorted(VALID_ITEM_STATUSES)}; got {status!r}"
-            )
-        idx = int(index_1based) - 1
-        if idx < 0 or idx >= len(self._state.checklist):
-            raise IndexError(
-                f"index out of range (1..{len(self._state.checklist)})"
-            )
-        item = self._state.checklist[idx]
-        item.status = status
-        if status in TERMINAL_ITEM_STATUSES:
-            item.completed_at = time.time()
-            if not item.evidence:
-                item.evidence = "marked by user"
-        else:
-            item.completed_at = None
-            # Don't wipe judge-supplied evidence on undo — useful audit trail.
-        save_goal(self.session_id, self._state)
-        return item
-
-    def remove_subgoal(self, index_1based: int) -> ChecklistItem:
-        if self._state is None:
-            raise RuntimeError("no active goal")
-        idx = int(index_1based) - 1
-        if idx < 0 or idx >= len(self._state.checklist):
-            raise IndexError(
-                f"index out of range (1..{len(self._state.checklist)})"
-            )
-        removed = self._state.checklist.pop(idx)
-        save_goal(self.session_id, self._state)
-        return removed
-
-    def clear_checklist(self) -> None:
-        """Wipe the checklist and reset decomposed=False so the judge re-decomposes."""
-        if self._state is None:
-            return
-        self._state.checklist = []
-        self._state.decomposed = False
-        save_goal(self.session_id, self._state)
-
     # --- the main entry point called after every turn -----------------
 
     def evaluate_after_turn(
@@ -1546,8 +463,6 @@ class GoalManager:
         last_response: str,
         *,
         user_initiated: bool = True,
-        agent: Any = None,
-        messages: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         """Run the judge and update state. Return a decision dict.
 
@@ -1555,21 +470,11 @@ class GoalManager:
         continuation prompt we fed ourselves (False). Both increment
         ``turns_used`` because both consume model budget.
 
-        ``messages`` is the agent's full conversation list for this session.
-        When provided, it's dumped to ``<HERMES_HOME>/goals/<sid>.json`` so
-        the Phase-B judge's read_file tool can inspect history. Optional —
-        when missing, the judge runs from the snippet only.
-
-        ``agent`` is a back-compat path — when ``messages`` is None we try
-        to extract them from common AIAgent attribute names. Most callers
-        should pass ``messages`` directly because AIAgent does not store
-        the message list as a public instance attribute.
-
         Decision keys:
           - ``status``: current goal status after update
           - ``should_continue``: bool — caller should fire another turn
           - ``continuation_prompt``: str or None
-          - ``verdict``: "done" | "continue" | "skipped" | "inactive" | "decompose"
+          - ``verdict``: "done" | "continue" | "skipped" | "inactive"
           - ``reason``: str
           - ``message``: user-visible one-liner to print/send
         """
@@ -1588,45 +493,7 @@ class GoalManager:
         state.turns_used += 1
         state.last_turn_at = time.time()
 
-        # ── Phase A: decompose (first call after /goal set) ───────────
-        if not state.decomposed:
-            items, err = decompose_goal(state.goal)
-            state.decomposed = True
-            decompose_message = ""
-            if items:
-                now = time.time()
-                for entry in items:
-                    state.checklist.append(
-                        ChecklistItem(
-                            text=entry["text"],
-                            status=ITEM_PENDING,
-                            added_by=ADDED_BY_JUDGE,
-                            added_at=now,
-                        )
-                    )
-                state.last_verdict = "decompose"
-                state.last_reason = f"decomposed into {len(items)} items"
-                decompose_message = (
-                    f"⊙ Goal checklist created ({len(items)} items). "
-                    f"Use /subgoal to view or edit it."
-                )
-                save_goal(self.session_id, state)
-                return {
-                    "status": "active",
-                    "should_continue": True,
-                    "continuation_prompt": self.next_continuation_prompt(),
-                    "verdict": "decompose",
-                    "reason": state.last_reason,
-                    "message": decompose_message,
-                }
-            # Decompose failed — fall through to freeform mode below.
-            logger.info("goal: decompose failed (%s) — falling back to freeform judge", err)
-            state.last_reason = f"decompose failed: {err}"
-
-        # ── Phase B: evaluate ────────────────────────────────────────
-        verdict, reason, parse_failed = self._evaluate_state_phase_b(
-            state, last_response, agent=agent, messages=messages
-        )
+        verdict, reason, parse_failed = judge_goal(state.goal, last_response)
         state.last_verdict = verdict
         state.last_reason = reason
 
@@ -1651,7 +518,11 @@ class GoalManager:
             }
 
         # Auto-pause when the judge model can't produce the expected JSON
-        # verdict N turns in a row.
+        # verdict N turns in a row. Points the user at the goal_judge config
+        # so they can route this side task to a model that follows the
+        # contract (e.g. google/gemini-3-flash-preview). Without this guard,
+        # weak judge models burn the entire turn budget returning prose or
+        # empty strings.
         if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
             state.status = "paused"
             state.paused_reason = (
@@ -1693,10 +564,6 @@ class GoalManager:
             }
 
         save_goal(self.session_id, state)
-        cl_total, cl_done, cl_imp, _ = state.checklist_counts()
-        progress = ""
-        if cl_total:
-            progress = f" — {cl_done + cl_imp}/{cl_total} done"
         return {
             "status": "active",
             "should_continue": True,
@@ -1704,168 +571,23 @@ class GoalManager:
             "verdict": "continue",
             "reason": reason,
             "message": (
-                f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}{progress}): {reason}"
+                f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}"
             ),
         }
 
-    def _evaluate_state_phase_b(
-        self,
-        state: GoalState,
-        last_response: str,
-        *,
-        agent: Any = None,
-        messages: Optional[List[Dict[str, Any]]] = None,
-    ) -> Tuple[str, str, bool]:
-        """Run the right kind of Phase-B evaluation given current state.
-
-        With a non-empty checklist: harsh per-item evaluation with a bounded
-        read_file tool loop.
-
-        With an empty checklist (e.g. decompose failed twice): fall back to
-        the legacy freeform judge so the goal still has a way to terminate.
-        """
-        if not last_response.strip():
-            return "continue", "empty response (nothing to evaluate)", False
-
-        if state.checklist:
-            # Dump conversation history if we have one. Prefer explicit
-            # ``messages`` arg (most reliable); fall back to extracting from
-            # the agent instance for back-compat.
-            history_path: Optional[Path] = None
-            msgs: List[Dict[str, Any]] = []
-            if messages:
-                msgs = list(messages)
-            elif agent is not None:
-                msgs = self._extract_agent_messages(agent)
-            if msgs:
-                history_path = dump_conversation(self.session_id, msgs)
-                if history_path is None:
-                    logger.debug(
-                        "goal: conversation dump failed for session %s",
-                        self.session_id,
-                    )
-            else:
-                logger.debug(
-                    "goal: no messages available for session %s — judge will run from snippet only",
-                    self.session_id,
-                )
-
-            parsed, parse_failed = evaluate_checklist(
-                state, last_response, history_path=history_path
-            )
-            self._apply_checklist_updates(state, parsed)
-
-            if state.all_terminal():
-                return "done", parsed.get("reason") or "all checklist items terminal", parse_failed
-            return "continue", parsed.get("reason") or "checklist progress", parse_failed
-
-        # No checklist — freeform fallback.
-        verdict, reason, parse_failed = judge_goal_freeform(state.goal, last_response)
-        return verdict, reason, parse_failed
-
-    # --- internal helpers ---------------------------------------------
-
-    @staticmethod
-    def _extract_agent_messages(agent: Any) -> List[Dict[str, Any]]:
-        """Best-effort extraction of the agent's conversation history.
-
-        Tries common attribute names so we don't tightly couple to AIAgent.
-        Returns an empty list when nothing is available.
-        """
-        for attr in ("messages", "conversation_history", "_messages", "history"):
-            try:
-                msgs = getattr(agent, attr, None)
-                if isinstance(msgs, list) and msgs:
-                    return msgs
-            except Exception:
-                continue
-        return []
-
-    @staticmethod
-    def _apply_checklist_updates(state: GoalState, parsed: Dict[str, Any]) -> None:
-        """Apply judge updates with stickiness: never regress terminal items."""
-        now = time.time()
-        for upd in parsed.get("updates") or []:
-            try:
-                idx = int(upd["index"])
-            except (KeyError, TypeError, ValueError):
-                continue
-            if idx < 0 or idx >= len(state.checklist):
-                continue
-            item = state.checklist[idx]
-            if item.status in TERMINAL_ITEM_STATUSES:
-                # Stickiness: judge cannot regress a terminal item.
-                continue
-            new_status = upd.get("status")
-            if new_status not in TERMINAL_ITEM_STATUSES:
-                continue
-            item.status = new_status
-            item.completed_at = now
-            evidence = upd.get("evidence")
-            if evidence:
-                item.evidence = evidence
-
-        for new_item in parsed.get("new_items") or []:
-            text = (new_item.get("text") or "").strip()
-            if not text:
-                continue
-            state.checklist.append(
-                ChecklistItem(
-                    text=text,
-                    status=ITEM_PENDING,
-                    added_by=ADDED_BY_JUDGE,
-                    added_at=now,
-                )
-            )
-
-    # --- continuation prompt ------------------------------------------
-
     def next_continuation_prompt(self) -> Optional[str]:
         if not self._state or self._state.status != "active":
             return None
-        if not self._state.checklist:
-            return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)
-        cl_total, cl_done, cl_imp, _ = self._state.checklist_counts()
-        return CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE.format(
-            goal=self._state.goal,
-            done=cl_done + cl_imp,
-            total=cl_total,
-            checklist=self._state.render_checklist(numbered=False),
-        )
-
-
-# Public name kept for back-compat with the previous freeform-only API.
-def judge_goal(
-    goal: str,
-    last_response: str,
-    *,
-    timeout: float = DEFAULT_JUDGE_TIMEOUT,
-) -> Tuple[str, str, bool]:
-    """Back-compat wrapper — defers to the freeform judge."""
-    return judge_goal_freeform(goal, last_response, timeout=timeout)
+        return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)
 
 
 __all__ = [
-    "ChecklistItem",
     "GoalState",
     "GoalManager",
     "CONTINUATION_PROMPT_TEMPLATE",
-    "CONTINUATION_PROMPT_WITH_CHECKLIST_TEMPLATE",
     "DEFAULT_MAX_TURNS",
-    "DEFAULT_MAX_JUDGE_TOOL_CALLS",
-    "ITEM_PENDING",
-    "ITEM_COMPLETED",
-    "ITEM_IMPOSSIBLE",
-    "ITEM_MARKERS",
-    "TERMINAL_ITEM_STATUSES",
-    "VALID_ITEM_STATUSES",
     "load_goal",
     "save_goal",
     "clear_goal",
     "judge_goal",
-    "judge_goal_freeform",
-    "decompose_goal",
-    "evaluate_checklist",
-    "conversation_dump_path",
-    "dump_conversation",
 ]
diff --git a/run_agent.py b/run_agent.py
index 3425c380492..cb39669ece7 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -10089,25 +10089,6 @@ class AIAgent:
                     parent_session_id=old_session_id,
                 )
                 self._session_db_created = True
-                # Forward any standing /goal state from the parent session to
-                # the continuation session so the goal loop survives
-                # auto-compression. Without this rebind, _get_goal_manager()
-                # constructs a fresh manager keyed on the new session_id,
-                # load_goal() returns None, mgr.is_active() is False, and
-                # the loop silently dies mid-task. The goal is stored in
-                # state_meta under "goal:<sid>" by hermes_cli.goals.
-                try:
-                    _goal_meta_key_old = f"goal:{old_session_id}"
-                    _goal_meta_key_new = f"goal:{self.session_id}"
-                    _goal_blob = self._session_db.get_meta(_goal_meta_key_old)
-                    if _goal_blob:
-                        self._session_db.set_meta(_goal_meta_key_new, _goal_blob)
-                        logger.info(
-                            "goal: forwarded standing goal from %s → %s on compression",
-                            old_session_id, self.session_id,
-                        )
-                except Exception as exc:
-                    logger.debug("goal forward on compression failed: %s", exc)
                 # Auto-number the title for the continuation session
                 if old_title:
                     try:
diff --git a/tests/cli/test_cli_goal_interrupt.py b/tests/cli/test_cli_goal_interrupt.py
index 879e87c6a73..851b87e856b 100644
--- a/tests/cli/test_cli_goal_interrupt.py
+++ b/tests/cli/test_cli_goal_interrupt.py
@@ -58,11 +58,6 @@ def _make_cli_with_goal(session_id: str, goal_text: str = "build a thing"):
 
     mgr = GoalManager(session_id=session_id, default_max_turns=5)
     mgr.set(goal_text)
-    # Skip Phase-A decompose so tests can patch judge_goal_freeform directly
-    # for legacy verdict assertions.
-    mgr.state.decomposed = True
-    from hermes_cli.goals import save_goal as _sg
-    _sg(mgr.session_id, mgr.state)
     cli._goal_manager = mgr
     return cli, mgr
 
@@ -86,7 +81,7 @@ class TestInterruptAutoPause:
 
         # Judge MUST NOT run on an interrupted turn. If it does, we've
         # regressed — fail loudly instead of silently querying a mock.
-        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called on an interrupted turn"
             )
@@ -111,7 +106,7 @@ class TestInterruptAutoPause:
         cli.conversation_history = [
             {"role": "assistant", "content": "partial"},
         ]
-        with patch("hermes_cli.goals.judge_goal_freeform"):
+        with patch("hermes_cli.goals.judge_goal"):
             cli._maybe_continue_goal_after_turn()
         assert mgr.state.status == "paused"
 
@@ -130,7 +125,7 @@ class TestEmptyResponseSkip:
             {"role": "assistant", "content": "   \n\n   "},
         ]
 
-        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called on an empty response"
             )
@@ -149,7 +144,7 @@ class TestEmptyResponseSkip:
             {"role": "user", "content": "go"},
         ]
 
-        with patch("hermes_cli.goals.judge_goal_freeform") as judge_mock:
+        with patch("hermes_cli.goals.judge_goal") as judge_mock:
             judge_mock.side_effect = AssertionError(
                 "judge_goal called without an assistant response"
             )
@@ -174,7 +169,7 @@ class TestHealthyTurnStillRuns:
 
         # Force the judge to say "continue" without touching the network.
         with patch(
-            "hermes_cli.goals.judge_goal_freeform",
+            "hermes_cli.goals.judge_goal",
             return_value=("continue", "needs more steps", False),
         ):
             cli._maybe_continue_goal_after_turn()
@@ -194,7 +189,7 @@ class TestHealthyTurnStillRuns:
         ]
 
         with patch(
-            "hermes_cli.goals.judge_goal_freeform",
+            "hermes_cli.goals.judge_goal",
             return_value=("done", "goal satisfied", False),
         ):
             cli._maybe_continue_goal_after_turn()
diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py
index ce9e8d6ad1e..14f536aa4f8 100644
--- a/tests/gateway/test_goal_verdict_send.py
+++ b/tests/gateway/test_goal_verdict_send.py
@@ -106,11 +106,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
 
     mgr = GoalManager(session_entry.session_id)
     mgr.set("ship the feature")
-    mgr.state.decomposed = True
-    from hermes_cli.goals import save_goal as _sg
-    _sg(mgr.session_id, mgr.state)
 
-    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "the feature shipped", False)):
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -138,11 +135,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
 
     mgr = GoalManager(session_entry.session_id)
     mgr.set("polish the docs")
-    mgr.state.decomposed = True
-    from hermes_cli.goals import save_goal as _sg
-    _sg(mgr.session_id, mgr.state)
 
-    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "still needs work", False)):
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -170,7 +164,7 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
     state.turns_used = 2
     save_goal(session_entry.session_id, state)
 
-    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("continue", "keep going", False)):
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
@@ -217,7 +211,7 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
 
     runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
 
-    with patch("hermes_cli.goals.judge_goal_freeform", return_value=("done", "ok", False)):
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
         # must not raise
         await runner._post_turn_goal_continuation(
             session_entry=session_entry,
diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py
index 680bf5ec1c8..b5afd716c9e 100644
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@@ -253,20 +253,14 @@ class TestGoalManager:
         assert mgr2.is_active()
 
     def test_evaluate_after_turn_done(self, hermes_home):
-        """Judge says done → status=done, no continuation.
-
-        Skips Phase-A decompose by patching ``decompose_goal`` to return
-        an empty checklist so the manager falls through to the freeform
-        judge path (legacy behavior preserved when decompose is unavailable).
-        """
+        """Judge says done → status=done, no continuation."""
         from hermes_cli import goals
         from hermes_cli.goals import GoalManager
 
         mgr = GoalManager(session_id="eval-sid-1")
         mgr.set("ship it")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)):
+        with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
             decision = mgr.evaluate_after_turn("I shipped the feature.")
 
         assert decision["verdict"] == "done"
@@ -282,8 +276,7 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
         mgr.set("a long goal")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)):
+        with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
             decision = mgr.evaluate_after_turn("made some progress")
 
         assert decision["verdict"] == "continue"
@@ -301,8 +294,7 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
         mgr.set("hard goal")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)):
+        with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
             d1 = mgr.evaluate_after_turn("step 1")
             assert d1["should_continue"] is True
             assert mgr.state.turns_used == 1
@@ -442,11 +434,9 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
         mgr.set("do a thing")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(
-                 goals, "judge_goal_freeform",
-                 return_value=("continue", "judge returned empty response", True),
-             ):
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
+        ):
             d1 = mgr.evaluate_after_turn("step 1")
             assert d1["should_continue"] is True
             assert mgr.state.consecutive_parse_failures == 1
@@ -473,21 +463,17 @@ class TestJudgeParseFailureAutoPause:
         mgr.set("another goal")
 
         # Two parse failures…
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(
-                 goals, "judge_goal_freeform",
-                 return_value=("continue", "not json", True),
-             ):
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "not json", True)
+        ):
             mgr.evaluate_after_turn("step 1")
             mgr.evaluate_after_turn("step 2")
             assert mgr.state.consecutive_parse_failures == 2
 
         # …then one clean reply resets the counter.
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(
-                 goals, "judge_goal_freeform",
-                 return_value=("continue", "making progress", False),
-             ):
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "making progress", False)
+        ):
             d = mgr.evaluate_after_turn("step 3")
             assert d["should_continue"] is True
             assert mgr.state.consecutive_parse_failures == 0
@@ -500,11 +486,9 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
         mgr.set("goal")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(
-                 goals, "judge_goal_freeform",
-                 return_value=("continue", "judge error: RuntimeError", False),
-             ):
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
+        ):
             for _ in range(5):
                 d = mgr.evaluate_after_turn("still going")
                 assert d["should_continue"] is True
@@ -521,857 +505,12 @@ class TestJudgeParseFailureAutoPause:
         mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
         mgr.set("persistent goal")
 
-        with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
-             patch.object(
-                 goals, "judge_goal_freeform",
-                 return_value=("continue", "empty", True),
-             ):
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "empty", True)
+        ):
             mgr.evaluate_after_turn("r")
             mgr.evaluate_after_turn("r")
 
         reloaded = load_goal("parse-fail-sid-4")
         assert reloaded is not None
         assert reloaded.consecutive_parse_failures == 2
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Checklist mode: GoalState backcompat + ChecklistItem
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestGoalStateBackcompat:
-    def test_old_state_meta_row_loads_without_checklist_fields(self):
-        """A goal serialized BEFORE the checklist fields existed must
-        round-trip through GoalState.from_json with empty defaults."""
-        from hermes_cli.goals import GoalState
-
-        legacy_json = json.dumps({
-            "goal": "do the thing",
-            "status": "active",
-            "turns_used": 3,
-            "max_turns": 20,
-            "created_at": 1.0,
-            "last_turn_at": 2.0,
-            "last_verdict": "continue",
-            "last_reason": "still working",
-            "paused_reason": None,
-            "consecutive_parse_failures": 1,
-        })
-        state = GoalState.from_json(legacy_json)
-        assert state.goal == "do the thing"
-        assert state.checklist == []
-        assert state.decomposed is False
-
-    def test_new_state_round_trip(self):
-        from hermes_cli.goals import (
-            ChecklistItem,
-            GoalState,
-            ITEM_COMPLETED,
-            ITEM_PENDING,
-            ADDED_BY_JUDGE,
-            ADDED_BY_USER,
-        )
-
-        state = GoalState(
-            goal="g",
-            decomposed=True,
-            checklist=[
-                ChecklistItem(text="a", status=ITEM_COMPLETED,
-                              added_by=ADDED_BY_JUDGE, evidence="done"),
-                ChecklistItem(text="b", status=ITEM_PENDING,
-                              added_by=ADDED_BY_USER),
-            ],
-        )
-        round_tripped = GoalState.from_json(state.to_json())
-        assert round_tripped.decomposed is True
-        assert len(round_tripped.checklist) == 2
-        assert round_tripped.checklist[0].text == "a"
-        assert round_tripped.checklist[0].status == ITEM_COMPLETED
-        assert round_tripped.checklist[0].evidence == "done"
-        assert round_tripped.checklist[1].added_by == ADDED_BY_USER
-
-    def test_checklist_counts_and_all_terminal(self):
-        from hermes_cli.goals import (
-            ChecklistItem, GoalState,
-            ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING,
-        )
-
-        state = GoalState(
-            goal="g",
-            checklist=[
-                ChecklistItem(text="a", status=ITEM_COMPLETED),
-                ChecklistItem(text="b", status=ITEM_IMPOSSIBLE),
-                ChecklistItem(text="c", status=ITEM_PENDING),
-            ],
-        )
-        total, done, imp, pending = state.checklist_counts()
-        assert (total, done, imp, pending) == (3, 1, 1, 1)
-        assert state.all_terminal() is False
-
-        state.checklist[2].status = ITEM_IMPOSSIBLE
-        assert state.all_terminal() is True
-
-    def test_empty_checklist_is_not_all_terminal(self):
-        """Empty list must NOT be considered done."""
-        from hermes_cli.goals import GoalState
-
-        state = GoalState(goal="g")
-        assert state.all_terminal() is False
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Phase A: decompose
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestPhaseADecompose:
-    def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE
-
-        mgr = GoalManager(session_id="phase-a-sid-1")
-        mgr.set("build a website")
-
-        items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}]
-        with patch.object(goals, "decompose_goal", return_value=(items, None)):
-            d = mgr.evaluate_after_turn("(initial response)")
-
-        assert d["verdict"] == "decompose"
-        assert d["should_continue"] is True
-        # Phase A produces a continuation prompt that includes the checklist.
-        assert d["continuation_prompt"] is not None
-        assert "Checklist progress" in d["continuation_prompt"]
-        assert mgr.state.decomposed is True
-        assert len(mgr.state.checklist) == 2
-        assert mgr.state.checklist[0].text == "homepage exists"
-        assert mgr.state.checklist[0].status == ITEM_PENDING
-        assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE
-
-    def test_decompose_only_runs_once(self, hermes_home):
-        """Decomposed=True after first call. Subsequent calls go to Phase B."""
-        from hermes_cli import goals
-        from hermes_cli.goals import GoalManager
-
-        mgr = GoalManager(session_id="phase-a-sid-2")
-        mgr.set("g")
-
-        with patch.object(
-            goals, "decompose_goal", return_value=([{"text": "x"}], None)
-        ) as decompose_mock, patch.object(
-            goals, "evaluate_checklist",
-            return_value=({"updates": [], "new_items": [], "reason": "..."}, False),
-        ) as eval_mock:
-            mgr.evaluate_after_turn("turn 1")
-            mgr.evaluate_after_turn("turn 2")
-            mgr.evaluate_after_turn("turn 3")
-
-        assert decompose_mock.call_count == 1
-        assert eval_mock.call_count == 2
-
-    def test_decompose_failure_falls_back_to_freeform(self, hermes_home):
-        """If decompose returns no items, manager falls through to freeform judge."""
-        from hermes_cli import goals
-        from hermes_cli.goals import GoalManager
-
-        mgr = GoalManager(session_id="phase-a-sid-3")
-        mgr.set("g")
-
-        with patch.object(goals, "decompose_goal", return_value=([], "model error")), \
-             patch.object(goals, "judge_goal_freeform",
-                          return_value=("done", "shipped", False)):
-            d = mgr.evaluate_after_turn("done!")
-
-        assert d["verdict"] == "done"
-        assert mgr.state.decomposed is True
-        assert mgr.state.checklist == []
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Phase B: evaluate (checklist mode)
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestPhaseBChecklist:
-    def _make_decomposed_mgr(self, sid: str, items):
-        """Helper: skip Phase A, install a decomposed checklist directly."""
-        from hermes_cli.goals import (
-            GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE,
-        )
-        from hermes_cli import goals as _g
-        mgr = GoalManager(session_id=sid)
-        mgr.set("a goal")
-        mgr.state.decomposed = True
-        mgr.state.checklist = [
-            ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE)
-            for t in items
-        ]
-        _g.save_goal(sid, mgr.state)
-        return mgr
-
-    def test_judge_flips_pending_to_completed(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING
-
-        mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"])
-        with patch.object(
-            goals, "evaluate_checklist",
-            return_value=(
-                {
-                    "updates": [
-                        {"index": 0, "status": "completed", "evidence": "done"},
-                        {"index": 1, "status": "completed", "evidence": "shipped"},
-                    ],
-                    "new_items": [],
-                    "reason": "made progress",
-                },
-                False,
-            ),
-        ):
-            d = mgr.evaluate_after_turn("agent did stuff")
-
-        assert d["verdict"] == "continue"
-        assert mgr.state.checklist[0].status == ITEM_COMPLETED
-        assert mgr.state.checklist[0].evidence == "done"
-        assert mgr.state.checklist[1].status == ITEM_COMPLETED
-        assert mgr.state.checklist[2].status == ITEM_PENDING
-
-    def test_goal_done_when_all_items_terminal(self, hermes_home):
-        from hermes_cli import goals
-
-        mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"])
-        with patch.object(
-            goals, "evaluate_checklist",
-            return_value=(
-                {
-                    "updates": [
-                        {"index": 0, "status": "completed", "evidence": "ok"},
-                        {"index": 1, "status": "impossible", "evidence": "blocked"},
-                    ],
-                    "new_items": [],
-                    "reason": "all done or blocked",
-                },
-                False,
-            ),
-        ):
-            d = mgr.evaluate_after_turn("response")
-
-        assert d["verdict"] == "done"
-        assert d["should_continue"] is False
-        assert mgr.state.status == "done"
-
-    def test_stickiness_judge_cannot_regress_completed(self, hermes_home):
-        """Once an item is completed, judge updates trying to flip it back are ignored."""
-        from hermes_cli import goals
-        from hermes_cli.goals import ITEM_COMPLETED
-
-        mgr = self._make_decomposed_mgr("phase-b-stick", ["a"])
-        # First turn completes item 0.
-        with patch.object(
-            goals, "evaluate_checklist",
-            return_value=(
-                {
-                    "updates": [{"index": 0, "status": "completed", "evidence": "yes"}],
-                    "new_items": [],
-                    "reason": "done",
-                },
-                False,
-            ),
-        ):
-            mgr.evaluate_after_turn("turn 1")
-        assert mgr.state.checklist[0].status == ITEM_COMPLETED
-        # Second turn: judge tries to send a non-terminal update.
-        # _parse_evaluate_response already filters non-terminal, but at the
-        # apply layer we also skip terminal items entirely. Smoke both.
-        with patch.object(
-            goals, "evaluate_checklist",
-            return_value=(
-                {
-                    "updates": [{"index": 0, "status": "impossible", "evidence": "regress"}],
-                    "new_items": [],
-                    "reason": "trying to regress",
-                },
-                False,
-            ),
-        ):
-            mgr.evaluate_after_turn("turn 2")
-        # Sticky: status stays completed, evidence unchanged.
-        assert mgr.state.checklist[0].status == ITEM_COMPLETED
-        assert mgr.state.checklist[0].evidence == "yes"
-
-    def test_judge_appends_new_items(self, hermes_home):
-        from hermes_cli import goals
-
-        mgr = self._make_decomposed_mgr("phase-b-new", ["a"])
-        with patch.object(
-            goals, "evaluate_checklist",
-            return_value=(
-                {
-                    "updates": [],
-                    "new_items": [{"text": "newly discovered"}, {"text": "also this"}],
-                    "reason": "found more work",
-                },
-                False,
-            ),
-        ):
-            mgr.evaluate_after_turn("response")
-        assert len(mgr.state.checklist) == 3
-        assert mgr.state.checklist[1].text == "newly discovered"
-        assert mgr.state.checklist[1].added_by == "judge"
-
-
-# ──────────────────────────────────────────────────────────────────────
-# /subgoal user controls
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestSubgoalUserControls:
-    def test_add_subgoal_appends_user_item(self, hermes_home):
-        from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER
-
-        mgr = GoalManager(session_id="user-sid-1")
-        mgr.set("g")
-        item = mgr.add_subgoal("user added")
-        assert item.text == "user added"
-        assert item.status == ITEM_PENDING
-        assert item.added_by == ADDED_BY_USER
-        assert len(mgr.state.checklist) == 1
-
-    def test_add_subgoal_requires_active_goal(self, hermes_home):
-        from hermes_cli.goals import GoalManager
-        mgr = GoalManager(session_id="user-sid-2")
-        with pytest.raises(RuntimeError):
-            mgr.add_subgoal("x")
-
-    def test_add_subgoal_rejects_empty_text(self, hermes_home):
-        from hermes_cli.goals import GoalManager
-        mgr = GoalManager(session_id="user-sid-3")
-        mgr.set("g")
-        with pytest.raises(ValueError):
-            mgr.add_subgoal("   ")
-
-    def test_mark_subgoal_uses_1_based_index(self, hermes_home):
-        from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE
-        mgr = GoalManager(session_id="user-sid-4")
-        mgr.set("g")
-        mgr.add_subgoal("a")
-        mgr.add_subgoal("b")
-        mgr.add_subgoal("c")
-        mgr.mark_subgoal(2, "completed")
-        mgr.mark_subgoal(3, "impossible")
-        assert mgr.state.checklist[0].status == "pending"
-        assert mgr.state.checklist[1].status == ITEM_COMPLETED
-        assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE
-
-    def test_mark_subgoal_rejects_invalid_index(self, hermes_home):
-        from hermes_cli.goals import GoalManager
-        mgr = GoalManager(session_id="user-sid-5")
-        mgr.set("g")
-        mgr.add_subgoal("a")
-        with pytest.raises(IndexError):
-            mgr.mark_subgoal(5, "completed")
-        with pytest.raises(IndexError):
-            mgr.mark_subgoal(0, "completed")
-
-    def test_user_can_revert_terminal_item(self, hermes_home):
-        """User mark_subgoal bypasses stickiness — only path to revert."""
-        from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING
-        mgr = GoalManager(session_id="user-sid-6")
-        mgr.set("g")
-        mgr.add_subgoal("a")
-        mgr.mark_subgoal(1, "completed")
-        assert mgr.state.checklist[0].status == ITEM_COMPLETED
-        mgr.mark_subgoal(1, "pending")
-        assert mgr.state.checklist[0].status == ITEM_PENDING
-
-    def test_remove_subgoal(self, hermes_home):
-        from hermes_cli.goals import GoalManager
-        mgr = GoalManager(session_id="user-sid-7")
-        mgr.set("g")
-        mgr.add_subgoal("a")
-        mgr.add_subgoal("b")
-        mgr.add_subgoal("c")
-        removed = mgr.remove_subgoal(2)
-        assert removed.text == "b"
-        assert [it.text for it in mgr.state.checklist] == ["a", "c"]
-
-    def test_clear_checklist_resets_decomposed(self, hermes_home):
-        from hermes_cli.goals import GoalManager
-        mgr = GoalManager(session_id="user-sid-8")
-        mgr.set("g")
-        mgr.state.decomposed = True
-        mgr.add_subgoal("a")
-        mgr.clear_checklist()
-        assert mgr.state.checklist == []
-        assert mgr.state.decomposed is False
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Conversation dump
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestConversationDump:
-    def test_dump_writes_messages_to_goals_dir(self, hermes_home):
-        from hermes_cli.goals import dump_conversation, conversation_dump_path
-
-        msgs = [
-            {"role": "user", "content": "hi"},
-            {"role": "assistant", "content": "hello"},
-        ]
-        path = dump_conversation("dump-sid-1", msgs)
-        assert path is not None
-        assert path.exists()
-        # Path is under <HERMES_HOME>/goals/<sid>.json
-        assert path.parent.name == "goals"
-        assert path.name == "dump-sid-1.json"
-
-        loaded = json.loads(path.read_text())
-        assert loaded == msgs
-
-        # conversation_dump_path returns the same path
-        assert conversation_dump_path("dump-sid-1") == path
-
-    def test_dump_handles_unsafe_session_id(self, hermes_home):
-        from hermes_cli.goals import dump_conversation
-
-        path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}])
-        assert path is not None
-        # No traversal — slashes are normalized to underscores. (Periods are
-        # preserved because they're legitimate in filenames; the resulting
-        # name still cannot escape <HERMES_HOME>/goals/ since path
-        # separators are gone.)
-        assert "/" not in path.name
-        assert path.parent.name == "goals"
-        # Verify the resolved path stays under the goals dir.
-        from hermes_cli.goals import _goals_dump_dir
-        goals_dir = _goals_dump_dir().resolve()
-        assert str(path.resolve()).startswith(str(goals_dir))
-
-    def test_dump_skips_when_messages_empty(self, hermes_home):
-        from hermes_cli.goals import dump_conversation
-        assert dump_conversation("sid", []) is None
-        assert dump_conversation("", [{"role": "user", "content": "x"}]) is None
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Judge read_file tool: path restriction
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestJudgeReadFile:
-    def test_restricted_to_allowed_path(self, hermes_home, tmp_path):
-        from hermes_cli.goals import _judge_read_file
-
-        allowed = tmp_path / "allowed.json"
-        allowed.write_text("hello\nworld\n")
-
-        ok = _judge_read_file(str(allowed), allowed_path=allowed)
-        loaded = json.loads(ok)
-        assert loaded["content"].startswith("hello")
-
-        # Try to read a different file.
-        sneaky = tmp_path / "secret.txt"
-        sneaky.write_text("nope\n")
-        denied = _judge_read_file(str(sneaky), allowed_path=allowed)
-        loaded = json.loads(denied)
-        assert "error" in loaded
-        assert "restricted" in loaded["error"]
-
-    def test_pagination(self, hermes_home, tmp_path):
-        from hermes_cli.goals import _judge_read_file
-        f = tmp_path / "big.json"
-        f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n")
-
-        # offset=10, limit=5 should return lines 10..14.
-        result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f))
-        assert result["returned"] == 5
-        assert "line-9" in result["content"]   # 1-based: line 10 == zero-indexed 9
-        assert result["next_offset"] == 15
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Index conversion: judge emits 1-based, apply layer uses 0-based
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestJudgeIndexConversion:
-    def test_parse_evaluate_converts_1based_to_0based(self):
-        """The judge sees the checklist with 1-based indices (rendered as
-        '1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices.
-        ``_parse_evaluate_response`` must convert them to 0-based so the
-        apply layer can index ``state.checklist`` directly.
-        """
-        from hermes_cli.goals import _parse_evaluate_response
-
-        raw = '''
-        {"updates": [
-            {"index": 1, "status": "completed", "evidence": "first item"},
-            {"index": 3, "status": "impossible", "evidence": "third item"}
-        ],
-         "new_items": [],
-         "reason": "evaluated"}
-        '''
-        parsed, parse_failed = _parse_evaluate_response(raw)
-        assert parse_failed is False
-        # 1 → 0, 3 → 2
-        assert [u["index"] for u in parsed["updates"]] == [0, 2]
-        assert parsed["updates"][0]["evidence"] == "first item"
-        assert parsed["updates"][1]["status"] == "impossible"
-
-    def test_full_round_trip_judge_index_to_state(self, hermes_home):
-        """End-to-end: judge emits 1-based, parser converts, apply layer
-        flips the right items in state.checklist."""
-        from hermes_cli import goals
-        from hermes_cli.goals import (
-            GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED,
-            ADDED_BY_JUDGE,
-        )
-
-        mgr = GoalManager(session_id="idx-round-trip")
-        mgr.set("g")
-        mgr.state.decomposed = True
-        mgr.state.checklist = [
-            ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
-            ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
-            ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
-        ]
-        goals.save_goal("idx-round-trip", mgr.state)
-
-        # Simulate the judge returning a raw-JSON Phase-B reply via the
-        # auxiliary client: the parser handles the 1-based → 0-based
-        # conversion so the apply layer flips item 1 (text="first").
-        class FakeMessage:
-            content = '''
-            {"updates": [{"index": 1, "status": "completed", "evidence": "first done"}],
-             "new_items": [],
-             "reason": "..."}
-            '''
-            tool_calls = None
-
-        class FakeChoice:
-            message = FakeMessage()
-
-        class FakeResponse:
-            choices = [FakeChoice()]
-
-        class FakeClient:
-            class chat:
-                class completions:
-                    @staticmethod
-                    def create(**kwargs):
-                        return FakeResponse()
-
-        with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")):
-            mgr.evaluate_after_turn("ran the script and item 1 is done")
-
-        # Item 1 (text="first") should now be completed.
-        assert mgr.state.checklist[0].text == "first"
-        assert mgr.state.checklist[0].status == ITEM_COMPLETED
-        assert mgr.state.checklist[0].evidence == "first done"
-        # Other items still pending.
-        assert mgr.state.checklist[1].status == ITEM_PENDING
-        assert mgr.state.checklist[2].status == ITEM_PENDING
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Compression session-rotation: goal must follow the new session_id
-# ──────────────────────────────────────────────────────────────────────
-
-
-class TestGoalSurvivesCompressionRotation:
-    def test_load_goal_after_session_id_rotates(self, hermes_home):
-        """When auto-compression rotates the session_id, the goal must be
-        readable from the new session_id (forwarded by run_agent's
-        _compress_context block).
-
-        We don't run the full _compress_context method here — it has
-        ~60 dependencies. Instead we mirror exactly what that block does
-        with state_meta and assert the goal manager picks it up.
-        """
-        from hermes_cli.goals import GoalManager
-        from hermes_state import SessionDB
-
-        # Create a goal under a parent session_id.
-        parent_sid = "parent-rotate-001"
-        mgr = GoalManager(session_id=parent_sid)
-        mgr.set("survive compression")
-        assert mgr.is_active()
-
-        # Simulate the run_agent._compress_context forwarding block:
-        # read goal:<old>, write goal:<new> on the same SessionDB instance.
-        db = SessionDB()
-        new_sid = "child-rotate-001"
-        blob = db.get_meta(f"goal:{parent_sid}")
-        assert blob, "goal must be in state_meta"
-        db.set_meta(f"goal:{new_sid}", blob)
-
-        # New GoalManager for the rotated session_id should load the same goal.
-        mgr2 = GoalManager(session_id=new_sid)
-        assert mgr2.is_active()
-        assert mgr2.state.goal == "survive compression"
-        # Counters/checklist preserved verbatim.
-        assert mgr2.state.turns_used == mgr.state.turns_used
-        assert mgr2.state.checklist == mgr.state.checklist
-
-    def test_no_forward_when_no_goal(self, hermes_home):
-        """Forwarding is a no-op when the parent session has no goal."""
-        from hermes_state import SessionDB
-        from hermes_cli.goals import load_goal
-
-        db = SessionDB()
-        # Parent has no goal at all.
-        assert db.get_meta("goal:parent-no-goal") is None
-        blob = db.get_meta("goal:parent-no-goal")
-        if blob:  # parity with production guard
-            db.set_meta("goal:child-no-goal", blob)
-
-        # Child should still have no goal.
-        assert load_goal("child-no-goal") is None
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Forced tool-call judge: submit_checklist (Phase A) + update_checklist (Phase B)
-# ──────────────────────────────────────────────────────────────────────
-
-
-class _FakeFn:
-    def __init__(self, name, args):
-        self.name = name
-        self.arguments = args if isinstance(args, str) else json.dumps(args)
-
-
-class _FakeToolCall:
-    def __init__(self, tc_id, name, args):
-        self.id = tc_id
-        self.type = "function"
-        self.function = _FakeFn(name, args)
-
-
-class _FakeMessage:
-    def __init__(self, *, content="", tool_calls=None):
-        self.content = content
-        self.tool_calls = tool_calls or []
-
-
-class _FakeChoice:
-    def __init__(self, message):
-        self.message = message
-
-
-class _FakeResponse:
-    def __init__(self, message):
-        self.choices = [_FakeChoice(message)]
-
-
-def _make_fake_client(scripted_messages):
-    """Return a fake client whose .chat.completions.create() returns the
-    next scripted message each call. Mutates the underlying list as a
-    queue so repeat calls advance.
-    """
-    class FakeClient:
-        class chat:
-            class completions:
-                _queue = list(scripted_messages)
-                _calls = []
-
-                @classmethod
-                def create(cls, **kwargs):
-                    cls._calls.append(kwargs)
-                    if not cls._queue:
-                        raise RuntimeError("scripted-message queue exhausted")
-                    return _FakeResponse(cls._queue.pop(0))
-
-    return FakeClient
-
-
-class TestPhaseAToolCall:
-    def test_decompose_via_submit_checklist_tool(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import decompose_goal
-
-        msg = _FakeMessage(
-            tool_calls=[_FakeToolCall(
-                "tc-1", "submit_checklist",
-                {"items": [{"text": "first criterion"}, {"text": "second criterion"}]},
-            )],
-        )
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            items, err = decompose_goal("build a website")
-
-        assert err is None
-        assert [it["text"] for it in items] == ["first criterion", "second criterion"]
-        # Verify we forced the tool: tool_choice should target submit_checklist.
-        call = client.chat.completions._calls[0]
-        assert "tools" in call
-        assert call["tools"][0]["function"]["name"] == "submit_checklist"
-        # tool_choice should be either {"type":"function","function":{"name":"submit_checklist"}}
-        # or "required" / "auto" if a fallback was used; primary attempt forces it.
-        tc = call["tool_choice"]
-        assert (
-            (isinstance(tc, dict) and tc.get("function", {}).get("name") == "submit_checklist")
-            or tc == "required"
-            or tc == "auto"
-        )
-
-    def test_decompose_falls_back_to_json_content_when_no_tool_call(self, hermes_home):
-        """If a broken provider returns content instead of a tool call, the
-        backstop JSON parser still salvages a checklist."""
-        from hermes_cli import goals
-        from hermes_cli.goals import decompose_goal
-
-        msg = _FakeMessage(
-            content='{"checklist": [{"text": "salvaged"}]}',
-            tool_calls=[],
-        )
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            items, err = decompose_goal("g")
-
-        assert err is None
-        assert items == [{"text": "salvaged"}]
-
-    def test_decompose_returns_error_when_no_tool_and_no_json(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import decompose_goal
-
-        msg = _FakeMessage(content="I think this should be done in stages.", tool_calls=[])
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            items, err = decompose_goal("g")
-
-        assert items == []
-        assert err and "submit_checklist" in err
-
-    def test_decompose_drops_empty_text_items(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import decompose_goal
-
-        msg = _FakeMessage(
-            tool_calls=[_FakeToolCall(
-                "tc-1", "submit_checklist",
-                {"items": [{"text": "ok"}, {"text": ""}, {"text": "  "}, {"text": "two"}]},
-            )],
-        )
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            items, err = decompose_goal("g")
-
-        assert err is None
-        assert [it["text"] for it in items] == ["ok", "two"]
-
-
-class TestPhaseBToolCall:
-    def test_evaluate_via_update_checklist_tool(self, hermes_home):
-        from hermes_cli import goals
-        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
-
-        state = GoalState(
-            goal="g",
-            decomposed=True,
-            checklist=[
-                ChecklistItem(text="a", status=ITEM_PENDING),
-                ChecklistItem(text="b", status=ITEM_PENDING),
-            ],
-        )
-
-        msg = _FakeMessage(
-            tool_calls=[_FakeToolCall(
-                "tc-1", "update_checklist",
-                {
-                    # 1-based indices; layer converts to 0-based.
-                    "updates": [{"index": 1, "status": "completed", "evidence": "did a"}],
-                    "new_items": [{"text": "discovered c"}],
-                    "reason": "ran a",
-                },
-            )],
-        )
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            parsed, parse_failed = evaluate_checklist(
-                state, "did the first thing", history_path=None,
-            )
-
-        assert parse_failed is False
-        # Index converted 1 → 0
-        assert parsed["updates"] == [{"index": 0, "status": "completed", "evidence": "did a"}]
-        assert parsed["new_items"] == [{"text": "discovered c"}]
-        assert parsed["reason"] == "ran a"
-
-    def test_evaluate_does_read_file_then_update(self, hermes_home, tmp_path):
-        """Phase-B tool loop: judge calls read_file once, then update_checklist."""
-        from hermes_cli import goals
-        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
-
-        # Make a real history file so the path-restriction check passes.
-        hist = tmp_path / "hist.json"
-        hist.write_text(json.dumps([{"role": "user", "content": "hi"}]))
-
-        state = GoalState(
-            goal="g",
-            decomposed=True,
-            checklist=[ChecklistItem(text="a", status=ITEM_PENDING)],
-        )
-
-        msg1 = _FakeMessage(tool_calls=[_FakeToolCall(
-            "tc-1", "read_file", {"path": str(hist), "offset": 1, "limit": 100},
-        )])
-        msg2 = _FakeMessage(tool_calls=[_FakeToolCall(
-            "tc-2", "update_checklist",
-            {
-                "updates": [{"index": 1, "status": "completed", "evidence": "saw it"}],
-                "new_items": [],
-                "reason": "verified via read_file",
-            },
-        )])
-        client = _make_fake_client([msg1, msg2])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            parsed, parse_failed = evaluate_checklist(
-                state, "did the thing", history_path=hist,
-            )
-
-        assert parse_failed is False
-        assert parsed["updates"][0]["status"] == "completed"
-        assert parsed["reason"] == "verified via read_file"
-        # Two API calls — one for the read, one for the verdict.
-        assert len(client.chat.completions._calls) == 2
-
-    def test_evaluate_filters_non_terminal_status_in_tool_args(self, hermes_home):
-        """update_checklist should only accept 'completed' or 'impossible' —
-        any 'pending' updates are dropped at the normalize layer."""
-        from hermes_cli import goals
-        from hermes_cli.goals import evaluate_checklist, GoalState, ChecklistItem, ITEM_PENDING
-
-        state = GoalState(
-            goal="g",
-            decomposed=True,
-            checklist=[
-                ChecklistItem(text="a", status=ITEM_PENDING),
-                ChecklistItem(text="b", status=ITEM_PENDING),
-            ],
-        )
-        msg = _FakeMessage(tool_calls=[_FakeToolCall(
-            "tc-1", "update_checklist",
-            {
-                "updates": [
-                    {"index": 1, "status": "completed", "evidence": "yes"},
-                    {"index": 2, "status": "pending", "evidence": "skip me"},
-                ],
-                "new_items": [],
-                "reason": "...",
-            },
-        )])
-        client = _make_fake_client([msg])
-
-        with patch.object(goals, "_get_judge_client", return_value=(client, "fake-model")):
-            parsed, _pf = evaluate_checklist(state, "x", history_path=None)
-
-        # Only the completed flip survives; pending update is dropped silently.
-        assert len(parsed["updates"]) == 1
-        assert parsed["updates"][0]["index"] == 0
diff --git a/tui_gateway/server.py b/tui_gateway/server.py
index 07febffaf98..73b2fc1a811 100644
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@@ -3251,7 +3251,6 @@ def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None:
                             decision = goal_mgr.evaluate_after_turn(
                                 raw,
                                 user_initiated=True,
-                                messages=list(session.get("history") or []),
                             )
                             verdict_msg = decision.get("message") or ""
                             if verdict_msg: