diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py index 621492da95c..f35682f8603 100644 --- a/gateway/slash_commands.py +++ b/gateway/slash_commands.py @@ -1777,6 +1777,10 @@ class GatewaySlashCommandsMixin: if not args or lower == "status": return mgr.status_line() + # /goal show → print the active goal's completion contract + if lower == "show": + return f"{mgr.status_line()}\n{mgr.render_contract()}" + if lower == "pause": state = mgr.pause(reason="user-paused") if state is None: @@ -1832,9 +1836,38 @@ class GatewaySlashCommandsMixin: return "▶ Wait barrier cleared — goal loop resumes." return "No wait barrier set." + # /goal draft → draft a structured completion contract, + # then set it. The aux LLM call is sync; run it off the event loop. + draft_contract_obj = None + if lower.startswith("draft"): + objective = args[len("draft"):].strip() + if not objective: + return "Usage: /goal draft " + try: + import asyncio + from hermes_cli.goals import draft_contract + + draft_contract_obj = await asyncio.get_running_loop().run_in_executor( + None, draft_contract, objective + ) + except Exception as exc: + logger.debug("goal draft failed: %s", exc) + draft_contract_obj = None + args = objective # the goal text is the objective + contract = draft_contract_obj + else: + # Inline `field: value` lines parse into a completion contract; + # the remaining prose is the goal headline. Plain free-form goals + # (no such lines) behave exactly as before. + from hermes_cli.goals import parse_contract + + headline, parsed = parse_contract(args) + args = headline or args + contract = parsed if not parsed.is_empty() else None + # Otherwise — treat the remaining text as the new goal. 
try: - state = mgr.set(args) + state = mgr.set(args, contract=contract) except ValueError as exc: return t("gateway.goal.invalid", error=str(exc)) @@ -1855,7 +1888,13 @@ class GatewaySlashCommandsMixin: except Exception as exc: logger.debug("goal kickoff enqueue failed: %s", exc) - return t("gateway.goal.set", budget=state.max_turns, goal=state.goal) + base = t("gateway.goal.set", budget=state.max_turns, goal=state.goal) + if state.has_contract(): + return f"{base}\nCompletion contract:\n{state.contract.render_block()}" + if lower.startswith("draft"): + # Drafting was requested but the aux model couldn't produce one. + return f"{base}\n(Couldn't draft a contract — running as a free-form goal.)" + return base async def _handle_subgoal_command(self, event: "MessageEvent") -> str: """Handle /subgoal for gateway platforms (mirror of CLI handler). diff --git a/hermes_cli/cli_commands_mixin.py b/hermes_cli/cli_commands_mixin.py index edd3f42542d..d8df27a5df4 100644 --- a/hermes_cli/cli_commands_mixin.py +++ b/hermes_cli/cli_commands_mixin.py @@ -1775,7 +1775,7 @@ class CLICommandsMixin: print() def _handle_goal_command(self, cmd: str) -> None: - """Dispatch /goal subcommands: set / status / pause / resume / clear.""" + """Dispatch /goal subcommands: set / draft / show / status / pause / resume / clear.""" from cli import _DIM, _RST, _cprint parts = (cmd or "").strip().split(None, 1) arg = parts[1].strip() if len(parts) > 1 else "" @@ -1792,6 +1792,25 @@ class CLICommandsMixin: _cprint(f" {mgr.status_line()}") return + # /goal show → print the active goal's completion contract + if lower == "show": + _cprint(f" {mgr.status_line()}") + _cprint(f" {mgr.render_contract()}") + return + + # /goal draft → expand plain text into a structured + # completion contract (outcome / verification / constraints / + # boundaries / stop_when) and set it as the active goal. 
Adapted + # from Codex's "let the agent draft the goal" guidance: the contract + # makes "done" evidence-based instead of a loose vibe check. + if lower.startswith("draft"): + objective = arg[len("draft"):].strip() + if not objective: + _cprint(" Usage: /goal draft ") + return + self._handle_goal_draft(objective) + return + if lower == "pause": state = mgr.pause(reason="user-paused") if state is None: @@ -1853,18 +1872,30 @@ class CLICommandsMixin: _cprint(f" {_DIM}No wait barrier set.{_RST}") return - # Otherwise treat the arg as the goal text. + # Otherwise treat the arg as the goal text. Inline `field: value` + # lines (verify:, constraints:, boundaries:, stop when:) are parsed + # into a completion contract; the remaining prose is the headline. + # A plain free-form goal with no such lines behaves exactly as before. + from hermes_cli.goals import parse_contract + + headline, contract = parse_contract(arg) + goal_text = headline or arg try: - state = mgr.set(arg) + state = mgr.set(goal_text, contract=contract if not contract.is_empty() else None) except ValueError as exc: _cprint(f" Invalid goal: {exc}") return _cprint(f" ⊙ Goal set ({state.max_turns}-turn budget): {state.goal}") + if state.has_contract(): + _cprint(f" {_DIM}Completion contract:{_RST}") + for line in state.contract.render_block().splitlines(): + _cprint(f" {line}") _cprint( - f" {_DIM}After each turn, a judge model will check if the goal is done. " + f" {_DIM}After each turn, a judge model checks if the goal is done" + f"{' against the contract above' if state.has_contract() else ''}. " f"Hermes keeps working until it is, you pause/clear it, or the budget is " - f"exhausted. Use /goal status, /goal pause, /goal resume, /goal clear.{_RST}" + f"exhausted. Use /goal status, /goal show, /goal pause, /goal resume, /goal clear.{_RST}" ) # Kick the loop off immediately so the user doesn't have to send a # separate message after setting the goal. 
@@ -1873,6 +1904,52 @@ class CLICommandsMixin: except Exception: pass + def _handle_goal_draft(self, objective: str) -> None: + """Draft a structured completion contract from a plain objective and + set it as the active goal. Falls back to a bare goal if the aux model + can't produce a contract.""" + from cli import _DIM, _RST, _cprint + from hermes_cli.goals import draft_contract + + mgr = self._get_goal_manager() + if mgr is None: + _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}") + return + + _cprint(f" {_DIM}Drafting completion contract…{_RST}") + try: + contract = draft_contract(objective) + except Exception as exc: + import logging as _logging + _logging.getLogger(__name__).debug("goal draft failed: %s", exc) + contract = None + + try: + state = mgr.set(objective, contract=contract) + except ValueError as exc: + _cprint(f" Invalid goal: {exc}") + return + + _cprint(f" ⊙ Goal set ({state.max_turns}-turn budget): {state.goal}") + if state.has_contract(): + _cprint(f" {_DIM}Drafted completion contract:{_RST}") + for line in state.contract.render_block().splitlines(): + _cprint(f" {line}") + _cprint( + f" {_DIM}Tighten any field by re-setting the goal with inline " + f"lines (e.g. verify: ), then /goal resume. " + f"Use /goal show to review.{_RST}" + ) + else: + _cprint( + f" {_DIM}Couldn't draft a contract (aux model unavailable) — " + f"running as a free-form goal. The per-turn judge still applies.{_RST}" + ) + try: + self._pending_input.put(state.goal) + except Exception: + pass + def _handle_subgoal_command(self, cmd: str) -> None: """Dispatch /subgoal subcommands. 
diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 59cb8aa3648..540b2865df3 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -108,7 +108,7 @@ COMMAND_REGISTRY: list[CommandDef] = [ CommandDef("steer", "Inject a message after the next tool call without interrupting", "Session", args_hint=""), CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session", - args_hint="[text | pause | resume | clear | status | wait | unwait]"), + args_hint="[text | draft | show | pause | resume | clear | status | wait | unwait]"), CommandDef("subgoal", "Add or manage extra criteria on the active goal", "Session", args_hint="[text | remove N | clear]"), CommandDef("status", "Show session, model, token, and context info", "Session"), diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index d9ef82909d8..3a1e869308a 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -76,6 +76,23 @@ CONTINUATION_PROMPT_TEMPLATE = ( "If you are blocked and need input from the user, say so clearly and stop." ) +# Used when the goal carries a structured completion contract. The contract +# block tells the agent exactly what "done" means, how to prove it, what not +# to break, what's in scope, and when to stop and ask — so it targets the +# verification surface instead of declaring victory loosely. +CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE = ( + "[Continuing toward your standing goal]\n" + "Goal: {goal}\n\n" + "Completion contract:\n" + "{contract_block}\n\n" + "Continue working toward the outcome above. Take the next concrete step. " + "Stay within the stated boundaries and do not violate the constraints. " + "Before claiming the goal is done, satisfy the Verification criterion and " + "show the concrete evidence (command output, file contents, test result). " + "If you hit the stated stop condition or are otherwise blocked and need " + "user input, say so clearly and stop." 
+) + # Used when the user has added one or more /subgoal criteria. Surfaced # to the agent verbatim so it sees what to target on the next turn, # and surfaced to the judge so the verdict considers them too. @@ -170,6 +187,199 @@ JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE = ( ) +# Used when the goal carries a structured completion contract. The judge +# decides DONE strictly against the Verification criterion and refuses to +# accept completion when a constraint was violated. +JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Completion contract (the authoritative definition of done):\n" + "{contract_block}\n\n" + "Agent's most recent response:\n{response}\n\n" + "{background_block}" + "Current time: {current_time}\n\n" + "Decision rules:\n" + "- The goal is DONE only when the Verification criterion is satisfied AND " + "the response shows concrete evidence of it (a command result, file " + "contents excerpt, test/benchmark output) — not a claim like 'done' or " + "'all tests pass' without evidence.\n" + "- If any stated Constraint was violated, the goal is NOT done — CONTINUE.\n" + "- If the response shows the agent is waiting on a listed background " + "process to satisfy the Verification criterion (e.g. CI is the " + "verification and it's still running), return WAIT on that process " + "instead of re-poking — re-poking now would be pure busy-work.\n" + "- If the response explains the work is blocked / unachievable / needs " + "user input (e.g. the stated Stop condition was hit), treat it as DONE " + "with the reason describing the block.\n" + "- Otherwise the goal is NOT done — CONTINUE.\n\n" + "Is the goal satisfied per its completion contract — done, continue, or wait?" +) + + +# System prompt for /goal draft — turns a plain-language objective into a +# structured completion contract the user can review before activating. +# Adapted from Codex's "let Codex draft the goal" guidance. 
+DRAFT_CONTRACT_SYSTEM_PROMPT = ( + "You turn a user's plain-language objective into a structured completion " + "contract for an autonomous coding agent. The contract has five fields:\n" + "- outcome: the single end state that must be true when done\n" + "- verification: the specific test / command / artifact that PROVES the " + "outcome (must be concrete and checkable)\n" + "- constraints: what must NOT change or regress\n" + "- boundaries: which files, dirs, tools, or systems are in scope\n" + "- stop_when: the condition under which the agent should stop and ask " + "for human input instead of pushing on\n\n" + "Infer sensible, specific values from the objective and any project " + "context implied by it. Prefer concrete verification (a named test " + "command, a build, a benchmark) over vague phrases. Keep each field to " + "one or two sentences. If a field genuinely cannot be inferred, use an " + "empty string for it.\n\n" + "Reply ONLY with a single JSON object on one line:\n" + '{"outcome": "...", "verification": "...", "constraints": "...", ' + '"boundaries": "...", "stop_when": "..."}' +) + + +# ────────────────────────────────────────────────────────────────────── +# Completion contract +# ────────────────────────────────────────────────────────────────────── + +# The five contract fields, in display order. Adapted from OpenAI Codex's +# "strong goal" guidance: a durable objective works best when it names what +# "done" means, how to prove it, what must not regress, what tools/paths are +# in bounds, and when to stop and ask. A bare free-form goal (no contract) +# stays fully supported — every field defaults empty and is simply omitted +# from the prompts when unset. +_CONTRACT_FIELDS = ("outcome", "verification", "constraints", "boundaries", "stop_when") + +# Human labels for rendering and for the inline `field: value` parser. 
+_CONTRACT_LABELS = { + "outcome": "Outcome", + "verification": "Verification", + "constraints": "Constraints", + "boundaries": "Boundaries", + "stop_when": "Stop when blocked", +} + +# Inline-input aliases the user may type before a value, mapped to the +# canonical field name. e.g. `verify: tests pass` or `done when: ...`. +_CONTRACT_ALIASES = { + "outcome": "outcome", + "goal": "outcome", + "done": "outcome", + "done when": "outcome", + "verification": "verification", + "verify": "verification", + "verified by": "verification", + "evidence": "verification", + "proof": "verification", + "constraints": "constraints", + "constraint": "constraints", + "preserve": "constraints", + "must not": "constraints", + "do not change": "constraints", + "boundaries": "boundaries", + "boundary": "boundaries", + "scope": "boundaries", + "allowed": "boundaries", + "files": "boundaries", + "stop when": "stop_when", + "stop_when": "stop_when", + "blocked": "stop_when", + "stop if blocked": "stop_when", + "give up when": "stop_when", +} + + +@dataclass +class GoalContract: + """Optional structured completion contract for a goal. + + Each field is free-form prose the user (or :func:`draft_contract`) + supplies. Empty fields are omitted everywhere — a goal with no contract + behaves exactly like the original free-form goal. The contract is woven + into both the continuation prompt (so the agent targets the verification + surface and respects constraints) and the judge prompt (so "done" is + decided against evidence, not vibes). 
+ """ + + outcome: str = "" + verification: str = "" + constraints: str = "" + boundaries: str = "" + stop_when: str = "" + + def is_empty(self) -> bool: + return not any(getattr(self, f).strip() for f in _CONTRACT_FIELDS) + + def to_dict(self) -> Dict[str, str]: + return {f: getattr(self, f) for f in _CONTRACT_FIELDS} + + @classmethod + def from_dict(cls, data: Optional[Dict[str, Any]]) -> "GoalContract": + if not isinstance(data, dict): + return cls() + return cls(**{f: str(data.get(f) or "").strip() for f in _CONTRACT_FIELDS}) + + def render_block(self) -> str: + """Render non-empty contract fields as a labelled block. Empty + contract → empty string (callers skip the section entirely).""" + lines = [] + for f in _CONTRACT_FIELDS: + val = getattr(self, f).strip() + if val: + lines.append(f"- {_CONTRACT_LABELS[f]}: {val}") + return "\n".join(lines) + + +def parse_contract(text: str) -> Tuple[str, GoalContract]: + """Split user-typed goal text into a headline + structured contract. + + Supports inline ``field: value`` lines so power users can type a full + contract in one shot, e.g.:: + + Migrate auth to JWT + verify: the auth test suite passes + constraints: keep the public /login response shape unchanged + boundaries: only touch services/auth and its tests + stop when: a schema change needs product sign-off + + The first non-field line(s) become the goal headline; recognized + ``field:`` lines populate the contract. Lines for the same field are + joined. Unrecognized prefixes stay part of the headline, so a plain + free-form goal with an incidental colon (``Fix bug: the parser``) + is NOT mangled — only lines whose prefix matches a known alias are + pulled out. Returns ``(headline, contract)``. 
+ """ + if not text: + return "", GoalContract() + + headline_parts: List[str] = [] + fields: Dict[str, List[str]] = {f: [] for f in _CONTRACT_FIELDS} + + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + continue + matched = False + if ":" in line: + prefix, _, value = line.partition(":") + key = _CONTRACT_ALIASES.get(prefix.strip().lower()) + if key is not None and value.strip(): + fields[key].append(value.strip()) + matched = True + if not matched: + headline_parts.append(line) + + headline = " ".join(headline_parts).strip() + contract = GoalContract( + **{f: " ".join(v).strip() for f, v in fields.items()} + ) + # If a headline was given but no explicit `outcome:` field, the headline + # IS the outcome — don't duplicate it into the contract block (the goal + # text already carries it), so leave outcome empty in that case. + return headline, contract + + # ────────────────────────────────────────────────────────────────────── # Dataclass # ────────────────────────────────────────────────────────────────────── @@ -219,9 +429,15 @@ class GoalState: waiting_until: float = 0.0 waiting_reason: Optional[str] = None waiting_since: float = 0.0 + # Optional structured completion contract (outcome / verification / + # constraints / boundaries / stop_when). Empty by default; a goal with + # no contract behaves exactly like the original free-form goal. + contract: GoalContract = field(default_factory=GoalContract) def to_json(self) -> str: - return json.dumps(asdict(self), ensure_ascii=False) + data = asdict(self) + # asdict already recursed GoalContract into a plain dict. 
+ return json.dumps(data, ensure_ascii=False) @classmethod def from_json(cls, raw: str) -> "GoalState": @@ -247,8 +463,14 @@ class GoalState: waiting_until=float(data.get("waiting_until", 0.0) or 0.0), waiting_reason=data.get("waiting_reason"), waiting_since=float(data.get("waiting_since", 0.0) or 0.0), + contract=GoalContract.from_dict(data.get("contract")), ) + # --- contract helpers ------------------------------------------------- + + def has_contract(self) -> bool: + return self.contract is not None and not self.contract.is_empty() + # --- subgoals helpers ------------------------------------------------- def render_subgoals_block(self) -> str: @@ -618,6 +840,7 @@ def judge_goal( timeout: float = DEFAULT_JUDGE_TIMEOUT, subgoals: Optional[List[str]] = None, background_processes: Optional[List[Dict[str, Any]]] = None, + contract: Optional[GoalContract] = None, ) -> Tuple[str, str, bool, Optional[Dict[str, Any]]]: """Ask the auxiliary model whether the goal is satisfied. @@ -637,6 +860,12 @@ def judge_goal( live ``process_registry.list_sessions()`` snapshot; when the agent is waiting on one (a CI poller, build, etc.) the judge can return a ``wait`` verdict naming its pid, parking the loop instead of re-poking. + ``contract`` is an optional structured completion contract; when present + the judge decides DONE strictly against its Verification criterion and + refuses completion when a Constraint was violated. All three are additive + — a contract, subgoals, and a background-process list can coexist in one + judge prompt; when none are set, behavior is identical to the original + free-form judge. This is deliberately fail-open: any error returns ``("continue", ..., False, None)`` so a broken judge doesn't wedge progress — the turn budget and the @@ -663,11 +892,30 @@ def judge_goal( if client is None or not model: return "continue", "no auxiliary client configured", False, None - # Build the prompt — pick the with-subgoals variant when applicable. 
+ # Build the prompt. Priority: contract > subgoals > plain. When both a + # contract and subgoals exist, the subgoals are appended into the + # contract block as extra criteria so the judge sees a single source of + # truth. clean_subgoals = [s.strip() for s in (subgoals or []) if s and s.strip()] background_block = _render_background_block(background_processes) current_time = datetime.now(tz=timezone.utc).astimezone().strftime("%Y-%m-%d %H:%M:%S %Z") - if clean_subgoals: + + if contract is not None and not contract.is_empty(): + contract_block = contract.render_block() + if clean_subgoals: + extra = "\n".join( + f"- Extra criterion {i}: {text}" + for i, text in enumerate(clean_subgoals, start=1) + ) + contract_block = f"{contract_block}\n{extra}" + prompt = JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE.format( + goal=_truncate(goal, 2000), + contract_block=_truncate(contract_block, 2500), + response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), + background_block=background_block, + current_time=current_time, + ) + elif clean_subgoals: subgoals_block = "\n".join( f"- {i}. {text}" for i, text in enumerate(clean_subgoals, start=1) ) @@ -736,6 +984,91 @@ def gather_background_processes(task_id: Optional[str] = None) -> List[Dict[str, return [s for s in sessions if isinstance(s, dict) and s.get("status") != "exited"] +def draft_contract(objective: str, *, timeout: float = DEFAULT_JUDGE_TIMEOUT) -> Optional[GoalContract]: + """Expand a plain-language objective into a structured completion contract. + + Uses the ``goal_judge`` auxiliary task (main-model-first, cache-safe — it + is a side LLM call, not a conversation turn). Returns a populated + :class:`GoalContract` on success, or ``None`` when the auxiliary client is + unavailable or the model's reply can't be parsed. Callers fall back to a + bare free-form goal in that case, so a missing/weak aux model never blocks + setting a goal. 
+ """ + objective = (objective or "").strip() + if not objective: + return None + + try: + from agent.auxiliary_client import get_auxiliary_extra_body, get_text_auxiliary_client + except Exception as exc: + logger.debug("goal draft: auxiliary client import failed: %s", exc) + return None + + try: + client, model = get_text_auxiliary_client("goal_judge") + except Exception as exc: + logger.debug("goal draft: get_text_auxiliary_client failed: %s", exc) + return None + + if client is None or not model: + return None + + try: + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": DRAFT_CONTRACT_SYSTEM_PROMPT}, + {"role": "user", "content": f"Objective:\n{_truncate(objective, 4000)}"}, + ], + temperature=0, + max_tokens=_goal_judge_max_tokens(), + timeout=timeout, + extra_body=get_auxiliary_extra_body() or None, + ) + except Exception as exc: + logger.info("goal draft: API call failed (%s)", exc) + return None + + try: + raw = resp.choices[0].message.content or "" + except Exception: + raw = "" + + data = _extract_json_object(raw) + if not isinstance(data, dict): + logger.debug("goal draft: reply was not JSON: %r", _truncate(raw, 200)) + return None + contract = GoalContract.from_dict(data) + return None if contract.is_empty() else contract + + +def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]: + """Best-effort: pull the first JSON object out of a model reply. + + Shares the fence-stripping + first-object fallback logic used by the + judge parser, but returns the dict (or None) rather than a verdict. 
+ """ + if not raw: + return None + text = raw.strip() + if text.startswith("```"): + text = text.strip("`") + nl = text.find("\n") + if nl != -1: + text = text[nl + 1:] + try: + data = json.loads(text) + except Exception: + match = _JSON_OBJECT_RE.search(text) + if not match: + return None + try: + data = json.loads(match.group(0)) + except Exception: + return None + return data if isinstance(data, dict) else None + + # ────────────────────────────────────────────────────────────────────── # GoalManager — the orchestration surface CLI + gateway talk to # ────────────────────────────────────────────────────────────────────── @@ -775,34 +1108,39 @@ class GoalManager: def has_goal(self) -> bool: return self._state is not None and self._state.status in {"active", "paused"} + def has_contract(self) -> bool: + return self._state is not None and self._state.has_contract() + def status_line(self) -> str: s = self._state if s is None or s.status in {"cleared",}: return "No active goal. Set one with /goal ." 
turns = f"{s.turns_used}/{s.max_turns} turns" sub = f", {len(s.subgoals)} subgoal{'s' if len(s.subgoals) != 1 else ''}" if s.subgoals else "" + con = ", contract" if self.has_contract() else "" + meta = f"{turns}{sub}{con}" if s.status == "active": if s.waiting_on_session and _session_waiting(s.waiting_on_session): wr = s.waiting_reason or f"session {s.waiting_on_session}" - return f"⏳ Goal (parked on {wr}, {turns}{sub}): {s.goal}" + return f"⏳ Goal (parked on {wr}, {meta}): {s.goal}" if s.waiting_on_pid and _pid_alive(s.waiting_on_pid): wr = s.waiting_reason or f"pid {s.waiting_on_pid}" - return f"⏳ Goal (parked on {wr}, {turns}{sub}): {s.goal}" + return f"⏳ Goal (parked on {wr}, {meta}): {s.goal}" if s.waiting_until and time.time() < s.waiting_until: remaining = int(s.waiting_until - time.time()) wr = s.waiting_reason or f"{remaining}s" - return f"⏳ Goal (parked {remaining}s — {wr}, {turns}{sub}): {s.goal}" - return f"⊙ Goal (active, {turns}{sub}): {s.goal}" + return f"⏳ Goal (parked {remaining}s — {wr}, {meta}): {s.goal}" + return f"⊙ Goal (active, {meta}): {s.goal}" if s.status == "paused": extra = f" — {s.paused_reason}" if s.paused_reason else "" - return f"⏸ Goal (paused, {turns}{sub}{extra}): {s.goal}" + return f"⏸ Goal (paused, {meta}{extra}): {s.goal}" if s.status == "done": - return f"✓ Goal done ({turns}{sub}): {s.goal}" - return f"Goal ({s.status}, {turns}{sub}): {s.goal}" + return f"✓ Goal done ({meta}): {s.goal}" + return f"Goal ({s.status}, {meta}): {s.goal}" # --- mutation ----------------------------------------------------- - def set(self, goal: str, *, max_turns: Optional[int] = None) -> GoalState: + def set(self, goal: str, *, max_turns: Optional[int] = None, contract: Optional[GoalContract] = None) -> GoalState: goal = (goal or "").strip() if not goal: raise ValueError("goal text is empty") @@ -813,11 +1151,23 @@ class GoalManager: max_turns=int(max_turns) if max_turns else self.default_max_turns, created_at=time.time(), last_turn_at=0.0, + 
contract=contract if contract is not None else GoalContract(), ) self._state = state save_goal(self.session_id, state) return state + def set_contract(self, contract: GoalContract) -> Optional[GoalState]: + """Attach or replace the completion contract on the active goal. + + Returns the updated state, or None when there is no goal to attach to. + """ + if self._state is None: + return None + self._state.contract = contract or GoalContract() + save_goal(self.session_id, self._state) + return self._state + def pause(self, reason: str = "user-paused") -> Optional[GoalState]: if not self._state: return None @@ -1096,6 +1446,7 @@ class GoalManager: last_response, subgoals=state.subgoals or None, background_processes=background_processes, + contract=state.contract if state.has_contract() else None, ) state.last_verdict = verdict state.last_reason = reason @@ -1206,6 +1557,21 @@ class GoalManager: def next_continuation_prompt(self) -> Optional[str]: if not self._state or self._state.status != "active": return None + # Contract takes priority: it carries the verification surface and + # constraints the agent must target. Subgoals fold in as extra + # criteria appended to the contract block. 
+ if self._state.has_contract(): + contract_block = self._state.contract.render_block() + if self._state.subgoals: + extra = "\n".join( + f"- Extra criterion {i}: {text}" + for i, text in enumerate(self._state.subgoals, start=1) + ) + contract_block = f"{contract_block}\n{extra}" + return CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE.format( + goal=self._state.goal, + contract_block=contract_block, + ) if self._state.subgoals: return CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE.format( goal=self._state.goal, @@ -1213,6 +1579,14 @@ class GoalManager: ) return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal) + def render_contract(self) -> str: + """Public helper for the /goal show + /goal draft slash commands.""" + if self._state is None: + return "(no active goal)" + if not self._state.has_contract(): + return "(no completion contract — set one with /goal draft or inline field: value lines)" + return self._state.contract.render_block() + # ────────────────────────────────────────────────────────────────────── # Kanban worker goal loop @@ -1368,11 +1742,17 @@ def run_kanban_goal_loop( __all__ = [ "GoalState", + "GoalContract", "GoalManager", + "parse_contract", + "draft_contract", "CONTINUATION_PROMPT_TEMPLATE", "CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE", + "CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE", "JUDGE_USER_PROMPT_TEMPLATE", "JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE", + "JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE", + "DRAFT_CONTRACT_SYSTEM_PROMPT", "KANBAN_GOAL_CONTINUATION_TEMPLATE", "KANBAN_GOAL_FINALIZE_TEMPLATE", "DEFAULT_MAX_TURNS", diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py index 2de73e29b9f..b6ae1abcda5 100644 --- a/tests/hermes_cli/test_goals.py +++ b/tests/hermes_cli/test_goals.py @@ -1219,3 +1219,350 @@ class TestSessionTriggerBarrier: "goal": "g", "status": "active", "turns_used": 0, "max_turns": 20, })) assert st.waiting_on_session is None + + +# 
# ──────────────────────────────────────────────────────────────────────
# Completion contract (Codex-inspired structured goals)
# ──────────────────────────────────────────────────────────────────────


def _fake_aux_client(content, captured=None):
    """Build a fake auxiliary-LLM client class for the contract tests.

    The returned class exposes ``chat.completions.create(**kwargs)``, which
    returns a response whose ``choices[0].message.content`` is *content*.

    Args:
        content: Text the fake model "replies" with.
        captured: Optional dict. When given, it is updated with the keyword
            arguments of every ``create`` call so a test can inspect the
            request (for example its ``messages``).

    Returns:
        The fake client class. The tests hand the class itself to the
        patched client factory; it is never instantiated here.
    """

    class _FakeMsg:
        pass

    # Assigned after the class body on purpose: ``content = content`` inside
    # the body would resolve the right-hand name in the class/global
    # namespaces rather than in this function's locals.
    _FakeMsg.content = content

    class _FakeChoice:
        message = _FakeMsg()

    class _FakeResp:
        choices = [_FakeChoice()]

    class _FakeClient:
        class chat:
            class completions:
                @staticmethod
                def create(**kwargs):
                    if captured is not None:
                        captured.update(kwargs)
                    return _FakeResp()

    return _FakeClient


class TestParseContract:
    """``parse_contract`` splits goal text into a headline and a contract."""

    def test_plain_goal_no_contract(self):
        from hermes_cli.goals import parse_contract

        headline, contract = parse_contract("Migrate auth to JWT")
        assert headline == "Migrate auth to JWT"
        assert contract.is_empty()

    def test_incidental_colon_not_treated_as_field(self):
        from hermes_cli.goals import parse_contract

        # "Fix bug:" — "fix bug" is not a known alias, so the whole line
        # stays the headline and no contract field is populated.
        headline, contract = parse_contract("Fix bug: the parser drops trailing commas")
        assert headline == "Fix bug: the parser drops trailing commas"
        assert contract.is_empty()

    def test_inline_fields_parsed(self):
        from hermes_cli.goals import parse_contract

        text = (
            "Migrate auth to JWT\n"
            "verify: the auth test suite passes\n"
            "constraints: keep the /login response shape unchanged\n"
            "boundaries: only touch services/auth and its tests\n"
            "stop when: a schema change needs product sign-off"
        )
        headline, contract = parse_contract(text)
        assert headline == "Migrate auth to JWT"
        assert contract.verification == "the auth test suite passes"
        assert contract.constraints == "keep the /login response shape unchanged"
        assert contract.boundaries == "only touch services/auth and its tests"
        assert contract.stop_when == "a schema change needs product sign-off"
        assert not contract.is_empty()

    def test_alias_variants(self):
        from hermes_cli.goals import parse_contract

        _, c = parse_contract("Goal\nverified by: tests green\npreserve: public API")
        assert c.verification == "tests green"
        assert c.constraints == "public API"

    def test_multiple_lines_same_field_joined(self):
        from hermes_cli.goals import parse_contract

        # Two lines for the same field are expected to be joined with a space.
        _, c = parse_contract("G\nconstraints: a\nconstraints: b")
        assert c.constraints == "a b"


class TestGoalContractSerialization:
    """JSON round-trip of ``GoalState`` with and without a contract."""

    def test_roundtrip_with_contract(self):
        from hermes_cli.goals import GoalState, GoalContract

        state = GoalState(
            goal="ship it",
            contract=GoalContract(
                verification="pytest passes",
                constraints="don't break the API",
            ),
        )
        restored = GoalState.from_json(state.to_json())
        assert restored.goal == "ship it"
        assert restored.contract.verification == "pytest passes"
        assert restored.contract.constraints == "don't break the API"
        assert restored.has_contract()

    def test_old_row_without_contract_loads_clean(self):
        # A state_meta row written before this feature has no "contract" key.
        from hermes_cli.goals import GoalState

        legacy = '{"goal": "old goal", "status": "active", "turns_used": 2}'
        state = GoalState.from_json(legacy)
        assert state.goal == "old goal"
        assert state.turns_used == 2
        assert state.contract.is_empty()
        assert not state.has_contract()

    def test_render_block_omits_empty_fields(self):
        from hermes_cli.goals import GoalContract

        block = GoalContract(outcome="X", verification="Y").render_block()
        assert "Outcome: X" in block
        assert "Verification: Y" in block
        assert "Constraints" not in block


class TestGoalManagerContract:
    """``GoalManager`` behaviour when a goal carries a completion contract."""

    def test_set_with_contract(self, hermes_home):
        from hermes_cli.goals import GoalManager, GoalContract

        mgr = GoalManager(session_id="c-set")
        mgr.set("ship it", contract=GoalContract(verification="tests pass"))
        assert mgr.has_contract()
        assert "contract" in mgr.status_line()

    def test_set_without_contract_no_marker(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="c-none")
        mgr.set("ship it")
        assert not mgr.has_contract()
        assert "contract" not in mgr.status_line()

    def test_continuation_prompt_includes_contract(self, hermes_home):
        from hermes_cli.goals import GoalManager, GoalContract

        mgr = GoalManager(session_id="c-cont")
        mgr.set("ship it", contract=GoalContract(verification="run pytest"))
        prompt = mgr.next_continuation_prompt()
        assert "Completion contract" in prompt
        assert "run pytest" in prompt
        assert "concrete evidence" in prompt

    def test_set_contract_after_the_fact(self, hermes_home):
        from hermes_cli.goals import GoalManager, GoalContract

        mgr = GoalManager(session_id="c-after")
        mgr.set("ship it")
        assert not mgr.has_contract()
        mgr.set_contract(GoalContract(verification="x"))
        assert mgr.has_contract()
        # Survives reload: a second manager for the same session sees it.
        assert GoalManager(session_id="c-after").has_contract()

    def test_persistence_roundtrip(self, hermes_home):
        from hermes_cli.goals import GoalManager, GoalContract

        GoalManager(session_id="c-persist").set(
            "ship it", contract=GoalContract(outcome="O", verification="V")
        )
        reloaded = GoalManager(session_id="c-persist")
        assert reloaded.state.contract.outcome == "O"
        assert reloaded.state.contract.verification == "V"


class TestJudgeWithContract:
    """The judge prompt built by ``judge_goal`` when a contract is supplied."""

    def _fake_client(self, captured, content='{"done": false, "reason": "more"}'):
        """Return a fake client that records ``create`` kwargs into *captured*."""
        return _fake_aux_client(content, captured)

    def test_judge_uses_contract_template(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals
        from hermes_cli.goals import GoalContract

        captured = {}
        client = self._fake_client(captured)
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            goals.judge_goal(
                "ship it", "I think it's done",
                contract=GoalContract(verification="pytest -q passes"),
            )
        user_msg = next(
            (m["content"] for m in (captured.get("messages") or []) if m["role"] == "user"), ""
        )
        assert "completion contract" in user_msg.lower()
        assert "pytest -q passes" in user_msg
        assert "concrete evidence" in user_msg

    def test_contract_plus_subgoals_combine(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals
        from hermes_cli.goals import GoalContract

        captured = {}
        client = self._fake_client(captured)
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            goals.judge_goal(
                "ship it", "done",
                subgoals=["write changelog"],
                contract=GoalContract(verification="pytest passes"),
            )
        user_msg = next(
            (m["content"] for m in (captured.get("messages") or []) if m["role"] == "user"), ""
        )
        assert "pytest passes" in user_msg
        assert "write changelog" in user_msg


class TestDraftContract:
    """``draft_contract`` with a JSON reply, a non-JSON reply, and no client."""

    def test_draft_parses_json(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals

        reply = (
            '{"outcome": "auth on JWT", "verification": "auth suite green", '
            '"constraints": "no API change", "boundaries": "services/auth", '
            '"stop_when": "schema change needed"}'
        )
        client = _fake_aux_client(reply)
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            contract = goals.draft_contract("Migrate auth to JWT")
        assert contract is not None
        assert contract.outcome == "auth on JWT"
        assert contract.verification == "auth suite green"
        assert not contract.is_empty()

    def test_draft_returns_none_on_bad_json(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals

        client = _fake_aux_client("I cannot produce JSON, sorry")
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            assert goals.draft_contract("anything") is None

    def test_draft_returns_none_when_no_client(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals

        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(None, None)):
            assert goals.draft_contract("anything") is None


# ──────────────────────────────────────────────────────────────────────
# Compose: completion contract + wait barrier in one judge call
# ──────────────────────────────────────────────────────────────────────


class TestContractAndBackgroundCompose:
    """A contract goal blocked on a background process must surface BOTH
    the contract block and the background-process list to the judge, so it
    can return either done (evidence met) or wait (parked on the poller)."""

    def _capture_client(self, captured, content='{"verdict": "wait", "wait_on_pid": 4242, "reason": "CI still running"}'):
        """Return a fake client that records ``create`` kwargs into *captured*.

        The default reply is a ``wait`` verdict pointing at pid 4242.
        """
        return _fake_aux_client(content, captured)

    def test_judge_prompt_carries_contract_and_background(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals
        from hermes_cli.goals import GoalContract

        captured = {}
        client = self._capture_client(captured)
        bg = [{
            "session_id": "ci-watch", "pid": 4242, "status": "running",
            "command": "wait_for_pr_green.sh 50501", "trigger": "exit",
        }]
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            # Only the verdict and the wait directive are asserted on below.
            verdict, _reason, _parse_failed, wait_directive = goals.judge_goal(
                "ship the PR",
                "I pushed and started the CI watcher; waiting on it now.",
                contract=GoalContract(verification="PR CI goes green"),
                background_processes=bg,
            )
        user_msg = next(
            (m["content"] for m in (captured.get("messages") or []) if m["role"] == "user"), ""
        )
        # Both surfaces present in one prompt.
        assert "completion contract" in user_msg.lower()
        assert "PR CI goes green" in user_msg
        assert "Background processes" in user_msg
        assert "4242" in user_msg
        # The judge can return a wait verdict on a contract goal.
        assert verdict == "wait"
        assert wait_directive and wait_directive.get("pid") == 4242

    def test_contract_goal_can_still_complete_on_evidence(self, hermes_home):
        from unittest.mock import patch
        from hermes_cli import goals
        from hermes_cli.goals import GoalContract

        captured = {}
        client = self._capture_client(
            captured,
            content='{"verdict": "done", "reason": "CI is green, evidence shown"}',
        )
        bg = [{"session_id": "ci", "pid": 4242, "status": "running", "command": "ci", "trigger": "exit"}]
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
             patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            verdict, _reason, _parse_failed, wait_directive = goals.judge_goal(
                "ship the PR",
                "CI finished: 30 passed, 0 failed. Done.",
                contract=GoalContract(verification="PR CI goes green"),
                background_processes=bg,
            )
        assert verdict == "done"
        assert wait_directive is None
Done.", + contract=GoalContract(verification="PR CI goes green"), + background_processes=bg, + ) + assert verdict == "done" + assert wait_directive is None diff --git a/website/docs/user-guide/features/goals.md b/website/docs/user-guide/features/goals.md index 8e1f4504e33..50b0a17e876 100644 --- a/website/docs/user-guide/features/goals.md +++ b/website/docs/user-guide/features/goals.md @@ -40,6 +40,8 @@ What you'll see: | Command | What it does | |---|---| | `/goal <text>` | Set (or replace) the standing goal. Kicks off the first turn immediately so you don't need to send a separate message. | +| `/goal draft <objective>` | Draft a structured completion contract from a plain-language objective, then set it. See [Completion contracts](#completion-contracts). | +| `/goal show` | Print the active goal's completion contract. | | `/goal` or `/goal status` | Show the current goal, its status, and turns used. | | `/goal pause` | Stop the auto-continuation loop without clearing the goal. | | `/goal resume` | Resume the loop (resets the turn counter back to zero). | @@ -49,6 +51,46 @@ What you'll see: Works identically on the CLI and every gateway platform (Telegram, Discord, Slack, Matrix, Signal, WhatsApp, SMS, iMessage, Webhook, API server, and the web dashboard). +## Completion contracts + +A bare `/goal <text>` works fine, but a *vague* goal makes for vague judging — the judge can only check what you told it to want. Codex's `/goal` guidance makes the same point: a durable objective works best when it names **what done means, how to prove it, what not to break, what's in scope, and when to stop**. Hermes adapts this as an optional **completion contract** layered on top of the existing goal loop. + +A contract has five fields, all optional: + +| Field | Meaning | +|---|---| +| `outcome` | The single end state that must be true when done. | +| `verification` | The specific test / command / artifact that *proves* the outcome. | +| `constraints` | What must not change or regress. 
| +| `boundaries` | Which files, dirs, tools, or systems are in scope. | +| `stop_when` | The condition under which Hermes should stop and ask for input. | + +When a contract is set, both prompts change: the **continuation prompt** tells the agent to target the verification surface and respect the constraints, and the **judge prompt** decides `done` *only when the verification criterion is met with concrete evidence* (a command result, file excerpt, test output) — not a loose "looks done" claim. This directly addresses the two most common `/goal` failure modes: premature completion, and endless over-continuation on an underspecified objective. + +### Two ways to set a contract + +**1. Let Hermes draft it** (recommended — adapted from Codex's "let the agent draft the goal" tip): + +``` +/goal draft Migrate the auth service from session cookies to JWT +``` + +Hermes expands your one-liner into a full contract via the `goal_judge` auxiliary model, sets it, and shows you the result so you can review or tighten any field. If the aux model is unavailable, it falls back to a plain free-form goal — drafting never blocks setting a goal. + +**2. Write it inline** with `field: value` lines: + +``` +/goal Migrate auth to JWT +verify: pytest tests/auth passes +constraints: keep the /login response shape unchanged +boundaries: only touch services/auth and its tests +stop when: a DB schema migration is required +``` + +The first non-field line(s) are the goal headline; recognized field prefixes (`verify:`, `verified by:`, `constraints:`, `preserve:`, `boundaries:`, `scope:`, `stop when:`, `blocked:`, …) populate the contract. A plain goal with an incidental colon (`Fix bug: the parser drops commas`) is **not** mangled — only known field prefixes are pulled out. + +Use `/goal show` to review the active contract. Contracts persist in `SessionDB.state_meta` alongside the goal, so they survive `/resume`. Old goals from before this feature load unchanged (no contract). 
Contracts and `/subgoal` criteria compose: subgoals fold into the contract as extra criteria the judge must also satisfy. + ## Adding criteria mid-goal: `/subgoal` While a goal is active you can append extra acceptance criteria with `/subgoal <text>` without resetting the loop. Each call adds one numbered item to the goal's subgoal list; the **continuation prompt** the agent sees on the next turn includes the original goal plus an "Additional criteria the user added mid-loop" block, and the **judge prompt** is rewritten so the verdict must consider every subgoal — the goal isn't marked done until the original objective **and** every subgoal are met.