mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
feat(goals): completion contracts for /goal — evidence-based judging (#50501)
Adds an optional structured completion contract to the standing-goal loop, adapted from OpenAI Codex's /goal guidance (a durable objective works best when it names what done means, how to prove it, what not to break, what's in scope, and when to stop).

A contract has five optional fields — outcome, verification, constraints, boundaries, stop_when. When set, the continuation prompt tells the agent to target the verification surface and respect constraints, and the judge marks the goal done only when the verification criterion is met with concrete evidence (command result, file excerpt, test output) instead of a loose "looks done" claim. This tightens the most common /goal failure modes: premature completion and endless over-continuation on an underspecified goal.

Two ways to set a contract, both backward compatible (bare /goal <text> behaves exactly as before):
- /goal draft <objective> — expands plain text into a full contract via the goal_judge aux model (cache-safe side call); falls back to a free-form goal if the model is unavailable.
- /goal <text> with inline 'field: value' lines (verify:, constraints:, boundaries:, stop when:, ...). Plain goals with an incidental colon are not mangled — only known field prefixes are pulled out.

In addition, /goal show prints the active contract.

Contracts persist in SessionDB.state_meta alongside the goal (survive /resume), compose with /subgoal criteria, and old goal rows load unchanged. CLI + every gateway platform via the shared GoalManager engine; zero new model tools.

Tests: +18 in tests/hermes_cli/test_goals.py (parse/serialize/judge-prompt/draft/fallback), 73/73 green; 42/42 across the broader goal test surface; live E2E roundtrip (set -> persist -> reload -> contract-aware prompts) green.
This commit is contained in:
parent
ff08e60c63
commit
2ba1cfeb2e
6 changed files with 904 additions and 19 deletions
|
|
@ -1777,6 +1777,10 @@ class GatewaySlashCommandsMixin:
|
|||
if not args or lower == "status":
|
||||
return mgr.status_line()
|
||||
|
||||
# /goal show → print the active goal's completion contract
|
||||
if lower == "show":
|
||||
return f"{mgr.status_line()}\n{mgr.render_contract()}"
|
||||
|
||||
if lower == "pause":
|
||||
state = mgr.pause(reason="user-paused")
|
||||
if state is None:
|
||||
|
|
@ -1832,9 +1836,38 @@ class GatewaySlashCommandsMixin:
|
|||
return "▶ Wait barrier cleared — goal loop resumes."
|
||||
return "No wait barrier set."
|
||||
|
||||
# /goal draft <objective> → draft a structured completion contract,
|
||||
# then set it. The aux LLM call is sync; run it off the event loop.
|
||||
draft_contract_obj = None
|
||||
if lower.startswith("draft"):
|
||||
objective = args[len("draft"):].strip()
|
||||
if not objective:
|
||||
return "Usage: /goal draft <objective in plain language>"
|
||||
try:
|
||||
import asyncio
|
||||
from hermes_cli.goals import draft_contract
|
||||
|
||||
draft_contract_obj = await asyncio.get_running_loop().run_in_executor(
|
||||
None, draft_contract, objective
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("goal draft failed: %s", exc)
|
||||
draft_contract_obj = None
|
||||
args = objective # the goal text is the objective
|
||||
contract = draft_contract_obj
|
||||
else:
|
||||
# Inline `field: value` lines parse into a completion contract;
|
||||
# the remaining prose is the goal headline. Plain free-form goals
|
||||
# (no such lines) behave exactly as before.
|
||||
from hermes_cli.goals import parse_contract
|
||||
|
||||
headline, parsed = parse_contract(args)
|
||||
args = headline or args
|
||||
contract = parsed if not parsed.is_empty() else None
|
||||
|
||||
# Otherwise — treat the remaining text as the new goal.
|
||||
try:
|
||||
state = mgr.set(args)
|
||||
state = mgr.set(args, contract=contract)
|
||||
except ValueError as exc:
|
||||
return t("gateway.goal.invalid", error=str(exc))
|
||||
|
||||
|
|
@ -1855,7 +1888,13 @@ class GatewaySlashCommandsMixin:
|
|||
except Exception as exc:
|
||||
logger.debug("goal kickoff enqueue failed: %s", exc)
|
||||
|
||||
return t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
|
||||
base = t("gateway.goal.set", budget=state.max_turns, goal=state.goal)
|
||||
if state.has_contract():
|
||||
return f"{base}\nCompletion contract:\n{state.contract.render_block()}"
|
||||
if lower.startswith("draft"):
|
||||
# Drafting was requested but the aux model couldn't produce one.
|
||||
return f"{base}\n(Couldn't draft a contract — running as a free-form goal.)"
|
||||
return base
|
||||
|
||||
async def _handle_subgoal_command(self, event: "MessageEvent") -> str:
|
||||
"""Handle /subgoal for gateway platforms (mirror of CLI handler).
|
||||
|
|
|
|||
|
|
@ -1775,7 +1775,7 @@ class CLICommandsMixin:
|
|||
print()
|
||||
|
||||
def _handle_goal_command(self, cmd: str) -> None:
|
||||
"""Dispatch /goal subcommands: set / status / pause / resume / clear."""
|
||||
"""Dispatch /goal subcommands: set / draft / show / status / pause / resume / clear."""
|
||||
from cli import _DIM, _RST, _cprint
|
||||
parts = (cmd or "").strip().split(None, 1)
|
||||
arg = parts[1].strip() if len(parts) > 1 else ""
|
||||
|
|
@ -1792,6 +1792,25 @@ class CLICommandsMixin:
|
|||
_cprint(f" {mgr.status_line()}")
|
||||
return
|
||||
|
||||
# /goal show → print the active goal's completion contract
|
||||
if lower == "show":
|
||||
_cprint(f" {mgr.status_line()}")
|
||||
_cprint(f" {mgr.render_contract()}")
|
||||
return
|
||||
|
||||
# /goal draft <objective> → expand plain text into a structured
|
||||
# completion contract (outcome / verification / constraints /
|
||||
# boundaries / stop_when) and set it as the active goal. Adapted
|
||||
# from Codex's "let the agent draft the goal" guidance: the contract
|
||||
# makes "done" evidence-based instead of a loose vibe check.
|
||||
if lower.startswith("draft"):
|
||||
objective = arg[len("draft"):].strip()
|
||||
if not objective:
|
||||
_cprint(" Usage: /goal draft <objective in plain language>")
|
||||
return
|
||||
self._handle_goal_draft(objective)
|
||||
return
|
||||
|
||||
if lower == "pause":
|
||||
state = mgr.pause(reason="user-paused")
|
||||
if state is None:
|
||||
|
|
@ -1853,18 +1872,30 @@ class CLICommandsMixin:
|
|||
_cprint(f" {_DIM}No wait barrier set.{_RST}")
|
||||
return
|
||||
|
||||
# Otherwise treat the arg as the goal text.
|
||||
# Otherwise treat the arg as the goal text. Inline `field: value`
|
||||
# lines (verify:, constraints:, boundaries:, stop when:) are parsed
|
||||
# into a completion contract; the remaining prose is the headline.
|
||||
# A plain free-form goal with no such lines behaves exactly as before.
|
||||
from hermes_cli.goals import parse_contract
|
||||
|
||||
headline, contract = parse_contract(arg)
|
||||
goal_text = headline or arg
|
||||
try:
|
||||
state = mgr.set(arg)
|
||||
state = mgr.set(goal_text, contract=contract if not contract.is_empty() else None)
|
||||
except ValueError as exc:
|
||||
_cprint(f" Invalid goal: {exc}")
|
||||
return
|
||||
|
||||
_cprint(f" ⊙ Goal set ({state.max_turns}-turn budget): {state.goal}")
|
||||
if state.has_contract():
|
||||
_cprint(f" {_DIM}Completion contract:{_RST}")
|
||||
for line in state.contract.render_block().splitlines():
|
||||
_cprint(f" {line}")
|
||||
_cprint(
|
||||
f" {_DIM}After each turn, a judge model will check if the goal is done. "
|
||||
f" {_DIM}After each turn, a judge model checks if the goal is done"
|
||||
f"{' against the contract above' if state.has_contract() else ''}. "
|
||||
f"Hermes keeps working until it is, you pause/clear it, or the budget is "
|
||||
f"exhausted. Use /goal status, /goal pause, /goal resume, /goal clear.{_RST}"
|
||||
f"exhausted. Use /goal status, /goal show, /goal pause, /goal resume, /goal clear.{_RST}"
|
||||
)
|
||||
# Kick the loop off immediately so the user doesn't have to send a
|
||||
# separate message after setting the goal.
|
||||
|
|
@ -1873,6 +1904,52 @@ class CLICommandsMixin:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
def _handle_goal_draft(self, objective: str) -> None:
    """Draft a completion contract for ``objective`` and set it as the goal.

    Calls ``draft_contract`` to expand the plain-language objective into a
    structured contract, then stores the objective (with the contract, if
    one was produced) through the goal manager and queues the goal text as
    the next input so the loop starts without another user message.

    When drafting raises or yields nothing, the goal is still set, just
    without a contract, and the user is told so.

    Args:
        objective: Plain-language description of what should be achieved.
    """
    import logging

    from cli import _DIM, _RST, _cprint
    from hermes_cli.goals import draft_contract

    log = logging.getLogger(__name__)

    mgr = self._get_goal_manager()
    if mgr is None:
        _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}")
        return

    _cprint(f" {_DIM}Drafting completion contract…{_RST}")
    try:
        contract = draft_contract(objective)
    except Exception as exc:
        # Drafting is best-effort: fall through to a contract-less goal.
        log.debug("goal draft failed: %s", exc)
        contract = None

    try:
        state = mgr.set(objective, contract=contract)
    except ValueError as exc:
        _cprint(f" Invalid goal: {exc}")
        return

    _cprint(f" ⊙ Goal set ({state.max_turns}-turn budget): {state.goal}")
    if state.has_contract():
        _cprint(f" {_DIM}Drafted completion contract:{_RST}")
        for line in state.contract.render_block().splitlines():
            _cprint(f" {line}")
        _cprint(
            f" {_DIM}Tighten any field by re-setting the goal with inline "
            f"lines (e.g. verify: <command>), then /goal resume. "
            f"Use /goal show to review.{_RST}"
        )
    else:
        _cprint(
            f" {_DIM}Couldn't draft a contract (aux model unavailable) — "
            f"running as a free-form goal. The per-turn judge still applies.{_RST}"
        )

    try:
        # Kick the loop off immediately with the goal text.
        self._pending_input.put(state.goal)
    except Exception as exc:
        # Still best-effort (the goal is already saved), but leave a trace
        # instead of swallowing the failure silently.
        log.debug("goal kickoff enqueue failed: %s", exc)
|
||||
|
||||
def _handle_subgoal_command(self, cmd: str) -> None:
|
||||
"""Dispatch /subgoal subcommands.
|
||||
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
|||
CommandDef("steer", "Inject a message after the next tool call without interrupting", "Session",
|
||||
args_hint="<prompt>"),
|
||||
CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session",
|
||||
args_hint="[text | pause | resume | clear | status | wait <pid> | unwait]"),
|
||||
args_hint="[text | draft <text> | show | pause | resume | clear | status | wait <pid> | unwait]"),
|
||||
CommandDef("subgoal", "Add or manage extra criteria on the active goal", "Session",
|
||||
args_hint="[text | remove N | clear]"),
|
||||
CommandDef("status", "Show session, model, token, and context info", "Session"),
|
||||
|
|
|
|||
|
|
@ -76,6 +76,23 @@ CONTINUATION_PROMPT_TEMPLATE = (
|
|||
"If you are blocked and need input from the user, say so clearly and stop."
|
||||
)
|
||||
|
||||
# Continuation prompt for goals that carry a completion contract. It takes
# two placeholders: {goal} and {contract_block} (the rendered contract,
# one "- Label: value" line per field). The wording steers the agent toward
# the verification surface and asks for evidence before it claims success.
CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE = (
    "[Continuing toward your standing goal]\nGoal: {goal}\n\n"
    "Completion contract:\n{contract_block}\n\n"
    "Continue working toward the outcome above. "
    "Take the next concrete step. "
    "Stay within the stated boundaries and do not violate the constraints. "
    "Before claiming the goal is done, satisfy the Verification criterion "
    "and show the concrete evidence "
    "(command output, file contents, test result). "
    "If you hit the stated stop condition or are otherwise blocked "
    "and need user input, say so clearly and stop."
)
|
||||
|
||||
# Used when the user has added one or more /subgoal criteria. Surfaced
|
||||
# to the agent verbatim so it sees what to target on the next turn,
|
||||
# and surfaced to the judge so the verdict considers them too.
|
||||
|
|
@ -170,6 +187,199 @@ JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE = (
|
|||
)
|
||||
|
||||
|
||||
# Judge prompt for goals that carry a completion contract. Placeholders:
# {goal}, {contract_block}, {response}, {background_block}, {current_time}.
# The rules make the Verification criterion plus concrete evidence the bar
# for DONE, and a violated Constraint an automatic CONTINUE.
JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE = (
    # Inputs the judge reasons over.
    "Goal:\n{goal}\n\n"
    "Completion contract (the authoritative definition of done):\n{contract_block}\n\n"
    "Agent's most recent response:\n{response}\n\n"
    "{background_block}Current time: {current_time}\n\n"
    # Verdict rules, one bullet per rule.
    "Decision rules:\n"
    "- The goal is DONE only when the Verification criterion is satisfied "
    "AND the response shows concrete evidence of it "
    "(a command result, file contents excerpt, test/benchmark output) "
    "— not a claim like 'done' or 'all tests pass' without evidence.\n"
    "- If any stated Constraint was violated, the goal is NOT done — CONTINUE.\n"
    "- If the response shows the agent is waiting on a listed background process "
    "to satisfy the Verification criterion "
    "(e.g. CI is the verification and it's still running), "
    "return WAIT on that process instead of re-poking "
    "— re-poking now would be pure busy-work.\n"
    "- If the response explains the work is blocked / unachievable / needs user input "
    "(e.g. the stated Stop condition was hit), "
    "treat it as DONE with the reason describing the block.\n"
    "- Otherwise the goal is NOT done — CONTINUE.\n\n"
    # Final question put to the judge.
    "Is the goal satisfied per its completion contract — done, continue, or wait?"
)
|
||||
|
||||
|
||||
# System prompt sent by draft_contract() (the engine behind /goal draft).
# It asks the model to turn a plain-language objective into the five
# contract fields and to answer with a single-line JSON object.
# Adapted from Codex's "let Codex draft the goal" guidance.
DRAFT_CONTRACT_SYSTEM_PROMPT = (
    # Role and field definitions.
    "You turn a user's plain-language objective into a structured "
    "completion contract for an autonomous coding agent. "
    "The contract has five fields:\n"
    "- outcome: the single end state that must be true when done\n"
    "- verification: the specific test / command / artifact that PROVES "
    "the outcome (must be concrete and checkable)\n"
    "- constraints: what must NOT change or regress\n"
    "- boundaries: which files, dirs, tools, or systems are in scope\n"
    "- stop_when: the condition under which the agent should stop and "
    "ask for human input instead of pushing on\n\n"
    # Guidance on how to fill the fields.
    "Infer sensible, specific values from the objective and any project "
    "context implied by it. "
    "Prefer concrete verification (a named test command, a build, a benchmark) "
    "over vague phrases. "
    "Keep each field to one or two sentences. "
    "If a field genuinely cannot be inferred, use an empty string for it.\n\n"
    # Required reply shape.
    "Reply ONLY with a single JSON object on one line:\n"
    '{"outcome": "...", "verification": "...", '
    '"constraints": "...", "boundaries": "...", "stop_when": "..."}'
)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Completion contract
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Canonical contract field names, in the order they are displayed. The set
# follows OpenAI Codex's "strong goal" guidance: say what "done" means, how
# to prove it, what must not regress, what is in bounds, and when to stop
# and ask. Every field is optional; a goal without a contract remains
# supported and unset fields are left out of the prompts.
_CONTRACT_FIELDS = (
    "outcome",
    "verification",
    "constraints",
    "boundaries",
    "stop_when",
)
|
||||
|
||||
# Display label for each canonical contract field; GoalContract.render_block
# prefixes every rendered line with the matching label.
_CONTRACT_LABELS = dict(
    outcome="Outcome",
    verification="Verification",
    constraints="Constraints",
    boundaries="Boundaries",
    stop_when="Stop when blocked",
)
|
||||
|
||||
# Lower-cased prefixes a user may type before a colon (e.g. `verify: tests
# pass` or `done when: ...`), mapped to the canonical field they populate.
# Written grouped by canonical field and flattened into alias -> field.
_CONTRACT_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("outcome", ("outcome", "goal", "done", "done when")),
        ("verification", ("verification", "verify", "verified by", "evidence", "proof")),
        ("constraints", ("constraints", "constraint", "preserve", "must not", "do not change")),
        ("boundaries", ("boundaries", "boundary", "scope", "allowed", "files")),
        ("stop_when", ("stop when", "stop_when", "blocked", "stop if blocked", "give up when")),
    )
    for alias in aliases
}
|
||||
|
||||
|
||||
@dataclass
class GoalContract:
    """Optional structured completion contract for a goal.

    Each field holds free-form prose supplied by the user or by
    :func:`draft_contract`. Blank fields are treated as absent: they are
    skipped when rendering, and a contract whose fields are all blank
    counts as empty.
    """

    outcome: str = ""
    verification: str = ""
    constraints: str = ""
    boundaries: str = ""
    stop_when: str = ""

    @staticmethod
    def _as_text(value: Any) -> str:
        """Coerce a raw field value to stripped text.

        Falsy values (``None``, ``""``, ``0``, empty containers) become
        ``""``. Lists and tuples, which a model may return as a JSON array
        instead of a string, are joined with ``"; "`` rather than being
        stored as their Python ``repr``. Anything else goes through
        ``str()``.
        """
        if not value:
            return ""
        if isinstance(value, (list, tuple)):
            parts = (str(item).strip() for item in value if item is not None)
            return "; ".join(part for part in parts if part)
        return str(value).strip()

    def is_empty(self) -> bool:
        """Return True when no field holds non-blank text."""
        # Going through _as_text keeps this safe if a field was set to None.
        return not any(self._as_text(getattr(self, f)) for f in _CONTRACT_FIELDS)

    def to_dict(self) -> Dict[str, str]:
        """Return the contract as a ``{field_name: value}`` mapping."""
        return {f: getattr(self, f) for f in _CONTRACT_FIELDS}

    @classmethod
    def from_dict(cls, data: Optional[Dict[str, Any]]) -> "GoalContract":
        """Build a contract from a mapping, tolerating bad input.

        Anything that is not a dict yields an empty contract. Missing keys
        become empty strings, and every value is normalized with
        :meth:`_as_text`.
        """
        if not isinstance(data, dict):
            return cls()
        return cls(**{f: cls._as_text(data.get(f)) for f in _CONTRACT_FIELDS})

    def render_block(self) -> str:
        """Render the non-blank fields as ``- Label: value`` lines.

        Fields appear in ``_CONTRACT_FIELDS`` order. An empty contract
        renders as an empty string.
        """
        lines = []
        for f in _CONTRACT_FIELDS:
            val = self._as_text(getattr(self, f))
            if val:
                lines.append(f"- {_CONTRACT_LABELS[f]}: {val}")
        return "\n".join(lines)
|
||||
|
||||
|
||||
def parse_contract(text: str) -> Tuple[str, GoalContract]:
    """Separate user-typed goal text into a headline and a contract.

    The text is read line by line. A line of the form ``prefix: value``
    whose prefix (case-insensitive, surrounding whitespace ignored) is a
    key of ``_CONTRACT_ALIASES`` and whose value is non-blank feeds the
    matching contract field. Every other non-blank line is headline prose,
    including lines with an unknown prefix (``Fix bug: the parser``) or a
    known prefix with nothing after the colon. For example::

        Migrate auth to JWT
        verify: the auth test suite passes
        stop when: a schema change needs product sign-off

    gives the headline ``Migrate auth to JWT`` plus ``verification`` and
    ``stop_when`` values. Repeated lines for one field are joined with a
    single space, as are multiple headline lines.

    The headline is never copied into ``outcome``; that field is filled
    only by an explicit outcome-style line.

    Returns:
        ``(headline, contract)``. Empty input gives ``("", GoalContract())``.
    """
    if not text:
        return "", GoalContract()

    prose: List[str] = []
    collected: Dict[str, List[str]] = {name: [] for name in _CONTRACT_FIELDS}

    for stripped in (candidate.strip() for candidate in text.splitlines()):
        if not stripped:
            continue
        label, colon, remainder = stripped.partition(":")
        remainder = remainder.strip()
        # Without a colon there is no prefix to look up.
        canonical = _CONTRACT_ALIASES.get(label.strip().lower()) if colon else None
        if canonical is None or not remainder:
            prose.append(stripped)
        else:
            collected[canonical].append(remainder)

    contract = GoalContract(
        **{name: " ".join(parts).strip() for name, parts in collected.items()}
    )
    return " ".join(prose).strip(), contract
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Dataclass
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -219,9 +429,15 @@ class GoalState:
|
|||
waiting_until: float = 0.0
|
||||
waiting_reason: Optional[str] = None
|
||||
waiting_since: float = 0.0
|
||||
# Optional structured completion contract (outcome / verification /
|
||||
# constraints / boundaries / stop_when). Empty by default; a goal with
|
||||
# no contract behaves exactly like the original free-form goal.
|
||||
contract: GoalContract = field(default_factory=GoalContract)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(asdict(self), ensure_ascii=False)
|
||||
data = asdict(self)
|
||||
# asdict already recursed GoalContract into a plain dict.
|
||||
return json.dumps(data, ensure_ascii=False)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, raw: str) -> "GoalState":
|
||||
|
|
@ -247,8 +463,14 @@ class GoalState:
|
|||
waiting_until=float(data.get("waiting_until", 0.0) or 0.0),
|
||||
waiting_reason=data.get("waiting_reason"),
|
||||
waiting_since=float(data.get("waiting_since", 0.0) or 0.0),
|
||||
contract=GoalContract.from_dict(data.get("contract")),
|
||||
)
|
||||
|
||||
# --- contract helpers -------------------------------------------------
|
||||
|
||||
def has_contract(self) -> bool:
    """Return True when a contract is attached and it is not empty."""
    attached = self.contract
    if attached is None:
        return False
    return not attached.is_empty()
|
||||
|
||||
# --- subgoals helpers -------------------------------------------------
|
||||
|
||||
def render_subgoals_block(self) -> str:
|
||||
|
|
@ -618,6 +840,7 @@ def judge_goal(
|
|||
timeout: float = DEFAULT_JUDGE_TIMEOUT,
|
||||
subgoals: Optional[List[str]] = None,
|
||||
background_processes: Optional[List[Dict[str, Any]]] = None,
|
||||
contract: Optional[GoalContract] = None,
|
||||
) -> Tuple[str, str, bool, Optional[Dict[str, Any]]]:
|
||||
"""Ask the auxiliary model whether the goal is satisfied.
|
||||
|
||||
|
|
@ -637,6 +860,12 @@ def judge_goal(
|
|||
live ``process_registry.list_sessions()`` snapshot; when the agent is
|
||||
waiting on one (a CI poller, build, etc.) the judge can return a ``wait``
|
||||
verdict naming its pid, parking the loop instead of re-poking.
|
||||
``contract`` is an optional structured completion contract; when present
|
||||
the judge decides DONE strictly against its Verification criterion and
|
||||
refuses completion when a Constraint was violated. All three are additive
|
||||
— a contract, subgoals, and a background-process list can coexist in one
|
||||
judge prompt; when none are set, behavior is identical to the original
|
||||
free-form judge.
|
||||
|
||||
This is deliberately fail-open: any error returns ``("continue", ..., False, None)``
|
||||
so a broken judge doesn't wedge progress — the turn budget and the
|
||||
|
|
@ -663,11 +892,30 @@ def judge_goal(
|
|||
if client is None or not model:
|
||||
return "continue", "no auxiliary client configured", False, None
|
||||
|
||||
# Build the prompt — pick the with-subgoals variant when applicable.
|
||||
# Build the prompt. Priority: contract > subgoals > plain. When both a
|
||||
# contract and subgoals exist, the subgoals are appended into the
|
||||
# contract block as extra criteria so the judge sees a single source of
|
||||
# truth.
|
||||
clean_subgoals = [s.strip() for s in (subgoals or []) if s and s.strip()]
|
||||
background_block = _render_background_block(background_processes)
|
||||
current_time = datetime.now(tz=timezone.utc).astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")
|
||||
if clean_subgoals:
|
||||
|
||||
if contract is not None and not contract.is_empty():
|
||||
contract_block = contract.render_block()
|
||||
if clean_subgoals:
|
||||
extra = "\n".join(
|
||||
f"- Extra criterion {i}: {text}"
|
||||
for i, text in enumerate(clean_subgoals, start=1)
|
||||
)
|
||||
contract_block = f"{contract_block}\n{extra}"
|
||||
prompt = JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE.format(
|
||||
goal=_truncate(goal, 2000),
|
||||
contract_block=_truncate(contract_block, 2500),
|
||||
response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS),
|
||||
background_block=background_block,
|
||||
current_time=current_time,
|
||||
)
|
||||
elif clean_subgoals:
|
||||
subgoals_block = "\n".join(
|
||||
f"- {i}. {text}" for i, text in enumerate(clean_subgoals, start=1)
|
||||
)
|
||||
|
|
@ -736,6 +984,91 @@ def gather_background_processes(task_id: Optional[str] = None) -> List[Dict[str,
|
|||
return [s for s in sessions if isinstance(s, dict) and s.get("status") != "exited"]
|
||||
|
||||
|
||||
def draft_contract(objective: str, *, timeout: float = DEFAULT_JUDGE_TIMEOUT) -> Optional[GoalContract]:
    """Ask the auxiliary model to expand an objective into a contract.

    Sends ``DRAFT_CONTRACT_SYSTEM_PROMPT`` plus the (truncated) objective
    to the client returned for the ``goal_judge`` auxiliary task and parses
    the reply as a JSON object.

    Args:
        objective: Plain-language objective; blank input gives ``None``.
        timeout: Timeout forwarded to the chat-completion call.

    Returns:
        A populated :class:`GoalContract`, or ``None`` when the auxiliary
        client cannot be imported or obtained, the API call fails, the
        reply is not a JSON object, or every drafted field is blank. The
        function returns ``None`` rather than raising, so callers can fall
        back to a free-form goal.
    """
    cleaned = (objective or "").strip()
    if not cleaned:
        return None

    try:
        from agent.auxiliary_client import get_auxiliary_extra_body, get_text_auxiliary_client
    except Exception as exc:
        logger.debug("goal draft: auxiliary client import failed: %s", exc)
        return None

    try:
        client, model = get_text_auxiliary_client("goal_judge")
    except Exception as exc:
        logger.debug("goal draft: get_text_auxiliary_client failed: %s", exc)
        return None

    if not (client is not None and model):
        return None

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": DRAFT_CONTRACT_SYSTEM_PROMPT},
                {"role": "user", "content": f"Objective:\n{_truncate(cleaned, 4000)}"},
            ],
            temperature=0,
            max_tokens=_goal_judge_max_tokens(),
            timeout=timeout,
            extra_body=get_auxiliary_extra_body() or None,
        )
    except Exception as exc:
        logger.info("goal draft: API call failed (%s)", exc)
        return None

    # A malformed response object is treated like an empty reply.
    try:
        reply = completion.choices[0].message.content or ""
    except Exception:
        reply = ""

    parsed = _extract_json_object(reply)
    if not isinstance(parsed, dict):
        logger.debug("goal draft: reply was not JSON: %r", _truncate(reply, 200))
        return None

    drafted = GoalContract.from_dict(parsed)
    if drafted.is_empty():
        return None
    return drafted
|
||||
|
||||
|
||||
def _extract_json_object(raw: str) -> Optional[Dict[str, Any]]:
    """Best-effort: pull the first JSON object out of a model reply.

    Handles replies wrapped in a Markdown code fence and replies with text
    around the object. The whole (unfenced) text is parsed first; if that
    fails, the first match of the module-level ``_JSON_OBJECT_RE`` is tried.

    Args:
        raw: Raw reply text from the model.

    Returns:
        The decoded dict, or ``None`` when the reply is empty, holds no
        parseable JSON, or decodes to something other than an object.
    """
    if not raw:
        return None
    text = raw.strip()
    if text.startswith("```"):
        text = text.strip("`")
        first_line, newline, remainder = text.partition("\n")
        # Drop the first line only when it is a language tag (e.g. "json").
        # If the JSON starts right on the fence line, dropping it would
        # discard the opening brace and make the reply unparseable.
        if newline and "{" not in first_line:
            text = remainder
    try:
        data = json.loads(text)
    except Exception:
        # Fall back to the first object-looking span in the text.
        match = _JSON_OBJECT_RE.search(text)
        if not match:
            return None
        try:
            data = json.loads(match.group(0))
        except Exception:
            return None
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# GoalManager — the orchestration surface CLI + gateway talk to
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -775,34 +1108,39 @@ class GoalManager:
|
|||
def has_goal(self) -> bool:
    """Return True when a goal exists and its status is active or paused."""
    state = self._state
    if state is None:
        return False
    return state.status in {"active", "paused"}
|
||||
|
||||
def has_contract(self) -> bool:
    """Return True when there is goal state and it reports a contract."""
    state = self._state
    if state is None:
        return False
    return state.has_contract()
|
||||
|
||||
def status_line(self) -> str:
|
||||
s = self._state
|
||||
if s is None or s.status in {"cleared",}:
|
||||
return "No active goal. Set one with /goal <text>."
|
||||
turns = f"{s.turns_used}/{s.max_turns} turns"
|
||||
sub = f", {len(s.subgoals)} subgoal{'s' if len(s.subgoals) != 1 else ''}" if s.subgoals else ""
|
||||
con = ", contract" if self.has_contract() else ""
|
||||
meta = f"{turns}{sub}{con}"
|
||||
if s.status == "active":
|
||||
if s.waiting_on_session and _session_waiting(s.waiting_on_session):
|
||||
wr = s.waiting_reason or f"session {s.waiting_on_session}"
|
||||
return f"⏳ Goal (parked on {wr}, {turns}{sub}): {s.goal}"
|
||||
return f"⏳ Goal (parked on {wr}, {meta}): {s.goal}"
|
||||
if s.waiting_on_pid and _pid_alive(s.waiting_on_pid):
|
||||
wr = s.waiting_reason or f"pid {s.waiting_on_pid}"
|
||||
return f"⏳ Goal (parked on {wr}, {turns}{sub}): {s.goal}"
|
||||
return f"⏳ Goal (parked on {wr}, {meta}): {s.goal}"
|
||||
if s.waiting_until and time.time() < s.waiting_until:
|
||||
remaining = int(s.waiting_until - time.time())
|
||||
wr = s.waiting_reason or f"{remaining}s"
|
||||
return f"⏳ Goal (parked {remaining}s — {wr}, {turns}{sub}): {s.goal}"
|
||||
return f"⊙ Goal (active, {turns}{sub}): {s.goal}"
|
||||
return f"⏳ Goal (parked {remaining}s — {wr}, {meta}): {s.goal}"
|
||||
return f"⊙ Goal (active, {meta}): {s.goal}"
|
||||
if s.status == "paused":
|
||||
extra = f" — {s.paused_reason}" if s.paused_reason else ""
|
||||
return f"⏸ Goal (paused, {turns}{sub}{extra}): {s.goal}"
|
||||
return f"⏸ Goal (paused, {meta}{extra}): {s.goal}"
|
||||
if s.status == "done":
|
||||
return f"✓ Goal done ({turns}{sub}): {s.goal}"
|
||||
return f"Goal ({s.status}, {turns}{sub}): {s.goal}"
|
||||
return f"✓ Goal done ({meta}): {s.goal}"
|
||||
return f"Goal ({s.status}, {meta}): {s.goal}"
|
||||
|
||||
# --- mutation -----------------------------------------------------
|
||||
|
||||
def set(self, goal: str, *, max_turns: Optional[int] = None) -> GoalState:
|
||||
def set(self, goal: str, *, max_turns: Optional[int] = None, contract: Optional[GoalContract] = None) -> GoalState:
|
||||
goal = (goal or "").strip()
|
||||
if not goal:
|
||||
raise ValueError("goal text is empty")
|
||||
|
|
@ -813,11 +1151,23 @@ class GoalManager:
|
|||
max_turns=int(max_turns) if max_turns else self.default_max_turns,
|
||||
created_at=time.time(),
|
||||
last_turn_at=0.0,
|
||||
contract=contract if contract is not None else GoalContract(),
|
||||
)
|
||||
self._state = state
|
||||
save_goal(self.session_id, state)
|
||||
return state
|
||||
|
||||
def set_contract(self, contract: GoalContract) -> Optional[GoalState]:
    """Attach or replace the completion contract on the current goal state.

    A falsy ``contract`` (e.g. ``None``) is stored as an empty
    ``GoalContract``. The updated state is persisted with ``save_goal``.

    Returns:
        The updated state, or ``None`` when no goal state exists.
    """
    state = self._state
    if state is None:
        return None
    state.contract = contract if contract else GoalContract()
    save_goal(self.session_id, state)
    return state
|
||||
|
||||
def pause(self, reason: str = "user-paused") -> Optional[GoalState]:
|
||||
if not self._state:
|
||||
return None
|
||||
|
|
@ -1096,6 +1446,7 @@ class GoalManager:
|
|||
last_response,
|
||||
subgoals=state.subgoals or None,
|
||||
background_processes=background_processes,
|
||||
contract=state.contract if state.has_contract() else None,
|
||||
)
|
||||
state.last_verdict = verdict
|
||||
state.last_reason = reason
|
||||
|
|
@ -1206,6 +1557,21 @@ class GoalManager:
|
|||
def next_continuation_prompt(self) -> Optional[str]:
|
||||
if not self._state or self._state.status != "active":
|
||||
return None
|
||||
# Contract takes priority: it carries the verification surface and
|
||||
# constraints the agent must target. Subgoals fold in as extra
|
||||
# criteria appended to the contract block.
|
||||
if self._state.has_contract():
|
||||
contract_block = self._state.contract.render_block()
|
||||
if self._state.subgoals:
|
||||
extra = "\n".join(
|
||||
f"- Extra criterion {i}: {text}"
|
||||
for i, text in enumerate(self._state.subgoals, start=1)
|
||||
)
|
||||
contract_block = f"{contract_block}\n{extra}"
|
||||
return CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE.format(
|
||||
goal=self._state.goal,
|
||||
contract_block=contract_block,
|
||||
)
|
||||
if self._state.subgoals:
|
||||
return CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE.format(
|
||||
goal=self._state.goal,
|
||||
|
|
@ -1213,6 +1579,14 @@ class GoalManager:
|
|||
)
|
||||
return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)
|
||||
|
||||
def render_contract(self) -> str:
    """Return the contract text used by the /goal show and /goal draft commands.

    Returns a parenthesised placeholder when there is no goal at all or the
    goal carries no contract; otherwise the contract's rendered block.
    """
    state = self._state
    if state is None:
        return "(no active goal)"
    if state.has_contract():
        return state.contract.render_block()
    return "(no completion contract — set one with /goal draft <objective> or inline field: value lines)"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Kanban worker goal loop
|
||||
|
|
@ -1368,11 +1742,17 @@ def run_kanban_goal_loop(
|
|||
|
||||
__all__ = [
|
||||
"GoalState",
|
||||
"GoalContract",
|
||||
"GoalManager",
|
||||
"parse_contract",
|
||||
"draft_contract",
|
||||
"CONTINUATION_PROMPT_TEMPLATE",
|
||||
"CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE",
|
||||
"CONTINUATION_PROMPT_WITH_CONTRACT_TEMPLATE",
|
||||
"JUDGE_USER_PROMPT_TEMPLATE",
|
||||
"JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE",
|
||||
"JUDGE_USER_PROMPT_WITH_CONTRACT_TEMPLATE",
|
||||
"DRAFT_CONTRACT_SYSTEM_PROMPT",
|
||||
"KANBAN_GOAL_CONTINUATION_TEMPLATE",
|
||||
"KANBAN_GOAL_FINALIZE_TEMPLATE",
|
||||
"DEFAULT_MAX_TURNS",
|
||||
|
|
|
|||
|
|
@ -1219,3 +1219,350 @@ class TestSessionTriggerBarrier:
|
|||
"goal": "g", "status": "active", "turns_used": 0, "max_turns": 20,
|
||||
}))
|
||||
assert st.waiting_on_session is None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Completion contract (Codex-inspired structured goals)
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestParseContract:
    """parse_contract(): split goal text into a headline and contract fields."""

    def test_plain_goal_no_contract(self):
        """Text with no field lines comes back whole, with an empty contract."""
        from hermes_cli.goals import parse_contract

        raw = "Migrate auth to JWT"
        title, parsed = parse_contract(raw)
        assert title == raw
        assert parsed.is_empty()

    def test_incidental_colon_not_treated_as_field(self):
        """A colon after an unknown prefix must not be read as a field."""
        from hermes_cli.goals import parse_contract

        # "fix bug" is not a known field alias, so the entire line remains
        # the headline and the contract stays unpopulated.
        raw = "Fix bug: the parser drops trailing commas"
        title, parsed = parse_contract(raw)
        assert title == raw
        assert parsed.is_empty()

    def test_inline_fields_parsed(self):
        """Each recognised ``field: value`` line lands on its contract attribute."""
        from hermes_cli.goals import parse_contract

        raw = (
            "Migrate auth to JWT\n"
            "verify: the auth test suite passes\n"
            "constraints: keep the /login response shape unchanged\n"
            "boundaries: only touch services/auth and its tests\n"
            "stop when: a schema change needs product sign-off"
        )
        expected_fields = {
            "verification": "the auth test suite passes",
            "constraints": "keep the /login response shape unchanged",
            "boundaries": "only touch services/auth and its tests",
            "stop_when": "a schema change needs product sign-off",
        }

        title, parsed = parse_contract(raw)

        assert title == "Migrate auth to JWT"
        for attr, value in expected_fields.items():
            assert getattr(parsed, attr) == value
        assert not parsed.is_empty()

    def test_alias_variants(self):
        """Alias prefixes map onto the canonical contract fields."""
        from hermes_cli.goals import parse_contract

        parsed = parse_contract("Goal\nverified by: tests green\npreserve: public API")[1]
        assert parsed.verification == "tests green"
        assert parsed.constraints == "public API"

    def test_multiple_lines_same_field_joined(self):
        """Repeated lines for one field are joined with a single space."""
        from hermes_cli.goals import parse_contract

        parsed = parse_contract("G\nconstraints: a\nconstraints: b")[1]
        assert parsed.constraints == "a b"
|
||||
|
||||
|
||||
class TestGoalContractSerialization:
    """JSON round-tripping of GoalState.contract and GoalContract rendering."""

    def test_roundtrip_with_contract(self):
        """Contract fields survive to_json() -> from_json()."""
        from hermes_cli.goals import GoalState, GoalContract

        contract = GoalContract(
            verification="pytest passes",
            constraints="don't break the API",
        )
        original = GoalState(goal="ship it", contract=contract)

        payload = original.to_json()
        restored = GoalState.from_json(payload)

        assert restored.goal == "ship it"
        assert restored.contract.verification == "pytest passes"
        assert restored.contract.constraints == "don't break the API"
        assert restored.has_contract()

    def test_old_row_without_contract_loads_clean(self):
        """A row with no "contract" key loads with an empty contract."""
        from hermes_cli.goals import GoalState

        # Rows written to state_meta before this feature lack a "contract" key.
        legacy_row = '{"goal": "old goal", "status": "active", "turns_used": 2}'
        loaded = GoalState.from_json(legacy_row)

        assert loaded.goal == "old goal"
        assert loaded.turns_used == 2
        assert loaded.contract.is_empty()
        assert not loaded.has_contract()

    def test_render_block_omits_empty_fields(self):
        """render_block() lists populated fields and leaves unset ones out."""
        from hermes_cli.goals import GoalContract

        rendered = GoalContract(outcome="X", verification="Y").render_block()

        for expected_line in ("Outcome: X", "Verification: Y"):
            assert expected_line in rendered
        assert "Constraints" not in rendered
|
||||
|
||||
|
||||
class TestGoalManagerContract:
    """GoalManager behaviour when a completion contract is attached to a goal."""

    def test_set_with_contract(self, hermes_home):
        """Setting a goal with a contract flags it in the status line."""
        from hermes_cli.goals import GoalManager, GoalContract

        manager = GoalManager(session_id="c-set")
        manager.set("ship it", contract=GoalContract(verification="tests pass"))

        assert manager.has_contract()
        assert "contract" in manager.status_line()

    def test_set_without_contract_no_marker(self, hermes_home):
        """A bare goal has no contract and no contract marker in the status line."""
        from hermes_cli.goals import GoalManager

        manager = GoalManager(session_id="c-none")
        manager.set("ship it")

        assert not manager.has_contract()
        assert "contract" not in manager.status_line()

    def test_continuation_prompt_includes_contract(self, hermes_home):
        """The continuation prompt carries the contract block and evidence wording."""
        from hermes_cli.goals import GoalManager, GoalContract

        manager = GoalManager(session_id="c-cont")
        manager.set("ship it", contract=GoalContract(verification="run pytest"))

        prompt = manager.next_continuation_prompt()

        for fragment in ("Completion contract", "run pytest", "concrete evidence"):
            assert fragment in prompt

    def test_set_contract_after_the_fact(self, hermes_home):
        """set_contract() on an existing goal takes effect and is persisted."""
        from hermes_cli.goals import GoalManager, GoalContract

        session = "c-after"
        manager = GoalManager(session_id=session)
        manager.set("ship it")
        assert not manager.has_contract()

        manager.set_contract(GoalContract(verification="x"))
        assert manager.has_contract()

        # A fresh manager for the same session must see the stored contract.
        reloaded = GoalManager(session_id=session)
        assert reloaded.has_contract()

    def test_persistence_roundtrip(self, hermes_home):
        """Contract fields given to set() are visible to a new manager instance."""
        from hermes_cli.goals import GoalManager, GoalContract

        session = "c-persist"
        writer = GoalManager(session_id=session)
        writer.set("ship it", contract=GoalContract(outcome="O", verification="V"))

        stored = GoalManager(session_id=session).state.contract
        assert stored.outcome == "O"
        assert stored.verification == "V"
|
||||
|
||||
|
||||
class TestJudgeWithContract:
    """judge_goal() prompt contents when a completion contract is supplied."""

    def _fake_client(self, captured, content='{"done": false, "reason": "more"}'):
        """Build a stub chat client that records ``create()`` kwargs.

        Only the ``client.chat.completions.create(**kwargs)`` call path and
        the ``response.choices[0].message.content`` attribute chain are
        provided. Every keyword passed to ``create`` is merged into
        *captured*; *content* is the canned reply text.
        """
        class _FakeMsg:
            pass

        _FakeMsg.content = content

        class _FakeChoice:
            message = _FakeMsg()

        class _FakeResp:
            choices = [_FakeChoice()]

        class _FakeClient:
            class chat:
                class completions:
                    @staticmethod
                    def create(**kwargs):
                        captured.update(kwargs)
                        return _FakeResp()

        return _FakeClient

    def _judge_user_message(self, *args, **kwargs):
        """Run ``judge_goal`` against the stub client; return the user prompt.

        Positional and keyword arguments are forwarded to ``judge_goal``
        unchanged. The result is the content of the first ``user``-role
        message sent to the stub, or ``""`` when none was captured.
        """
        from unittest.mock import patch
        from hermes_cli import goals

        captured = {}
        client = self._fake_client(captured)
        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
                patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            goals.judge_goal(*args, **kwargs)
        return next(
            (m["content"] for m in (captured.get("messages") or []) if m["role"] == "user"), ""
        )

    def test_judge_uses_contract_template(self, hermes_home):
        """A contract switches the judge onto the contract-aware prompt."""
        from hermes_cli.goals import GoalContract

        user_msg = self._judge_user_message(
            "ship it", "I think it's done",
            contract=GoalContract(verification="pytest -q passes"),
        )
        assert "completion contract" in user_msg.lower()
        assert "pytest -q passes" in user_msg
        assert "concrete evidence" in user_msg

    def test_contract_plus_subgoals_combine(self, hermes_home):
        """Contract text and subgoal text both reach the judge prompt."""
        from hermes_cli.goals import GoalContract

        user_msg = self._judge_user_message(
            "ship it", "done",
            subgoals=["write changelog"],
            contract=GoalContract(verification="pytest passes"),
        )
        assert "pytest passes" in user_msg
        assert "write changelog" in user_msg
|
||||
|
||||
|
||||
class TestDraftContract:
    """draft_contract(): a contract from the aux model reply, or None."""

    @staticmethod
    def _fake_client(content):
        """Build a stub chat client whose every completion replies *content*.

        Only ``client.chat.completions.create(**kwargs)`` and the
        ``response.choices[0].message.content`` chain are provided.
        """
        class _FakeMsg:
            pass

        _FakeMsg.content = content

        class _FakeChoice:
            message = _FakeMsg()

        class _FakeResp:
            choices = [_FakeChoice()]

        class _FakeClient:
            class chat:
                class completions:
                    @staticmethod
                    def create(**kwargs):
                        return _FakeResp()

        return _FakeClient

    def _draft_with_reply(self, objective, content):
        """Call ``draft_contract(objective)`` with the aux client stubbed.

        The stub answers *content*; the auxiliary extra-body lookup is
        patched to return ``None``.
        """
        from unittest.mock import patch
        from hermes_cli import goals

        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(self._fake_client(content), "fake-model")), \
                patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            return goals.draft_contract(objective)

    def test_draft_parses_json(self, hermes_home):
        """A JSON reply is turned into a populated contract."""
        reply = (
            '{"outcome": "auth on JWT", "verification": "auth suite green", '
            '"constraints": "no API change", "boundaries": "services/auth", '
            '"stop_when": "schema change needed"}'
        )
        contract = self._draft_with_reply("Migrate auth to JWT", reply)
        assert contract is not None
        assert contract.outcome == "auth on JWT"
        assert contract.verification == "auth suite green"
        assert not contract.is_empty()

    def test_draft_returns_none_on_bad_json(self, hermes_home):
        """A reply that is not JSON yields None."""
        assert self._draft_with_reply("anything", "I cannot produce JSON, sorry") is None

    def test_draft_returns_none_when_no_client(self, hermes_home):
        """With no auxiliary client available, drafting yields None."""
        from unittest.mock import patch
        from hermes_cli import goals

        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(None, None)):
            assert goals.draft_contract("anything") is None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Compose: completion contract + wait barrier in one judge call
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestContractAndBackgroundCompose:
    """A contract goal blocked on a background process must surface BOTH
    the contract block and the background-process list to the judge, so it
    can return either done (evidence met) or wait (parked on the poller)."""

    def _capture_client(self, captured, content='{"verdict": "wait", "wait_on_pid": 4242, "reason": "CI still running"}'):
        """Build a stub chat client that records ``create()`` kwargs.

        Every keyword passed to ``client.chat.completions.create`` is merged
        into *captured*; *content* is the canned reply exposed as
        ``response.choices[0].message.content``.
        """
        class _FakeMsg:
            pass

        _FakeMsg.content = content

        class _FakeChoice:
            message = _FakeMsg()

        class _FakeResp:
            choices = [_FakeChoice()]

        class _FakeClient:
            class chat:
                class completions:
                    @staticmethod
                    def create(**kwargs):
                        captured.update(kwargs)
                        return _FakeResp()

        return _FakeClient

    def _judge(self, client, *args, **kwargs):
        """Call ``judge_goal(*args, **kwargs)`` with *client* as the aux client.

        Returns whatever ``judge_goal`` returns; both tests unpack it as a
        4-tuple ``(verdict, reason, parse_failed, wait_directive)``.
        """
        from unittest.mock import patch
        from hermes_cli import goals

        with patch("agent.auxiliary_client.get_text_auxiliary_client",
                   return_value=(client, "fake-model")), \
                patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None):
            return goals.judge_goal(*args, **kwargs)

    def test_judge_prompt_carries_contract_and_background(self, hermes_home):
        """One judge prompt holds both the contract and the process list."""
        from hermes_cli.goals import GoalContract

        captured = {}
        bg = [{
            "session_id": "ci-watch", "pid": 4242, "status": "running",
            "command": "wait_for_pr_green.sh 50501", "trigger": "exit",
        }]
        # The reason / parse_failed slots are unpacked only to pin the
        # 4-tuple shape; their values are not asserted here.
        verdict, _reason, _parse_failed, wait_directive = self._judge(
            self._capture_client(captured),
            "ship the PR",
            "I pushed and started the CI watcher; waiting on it now.",
            contract=GoalContract(verification="PR CI goes green"),
            background_processes=bg,
        )
        user_msg = next(
            (m["content"] for m in (captured.get("messages") or []) if m["role"] == "user"), ""
        )
        # Both surfaces present in one prompt.
        assert "completion contract" in user_msg.lower()
        assert "PR CI goes green" in user_msg
        assert "Background processes" in user_msg
        assert "4242" in user_msg
        # The judge can return a wait verdict on a contract goal.
        assert verdict == "wait"
        assert wait_directive and wait_directive.get("pid") == 4242

    def test_contract_goal_can_still_complete_on_evidence(self, hermes_home):
        """A done verdict passes through even while a process is still listed."""
        from hermes_cli.goals import GoalContract

        client = self._capture_client(
            {},
            content='{"verdict": "done", "reason": "CI is green, evidence shown"}',
        )
        bg = [{"session_id": "ci", "pid": 4242, "status": "running", "command": "ci", "trigger": "exit"}]
        verdict, _reason, _parse_failed, wait_directive = self._judge(
            client,
            "ship the PR",
            "CI finished: 30 passed, 0 failed. Done.",
            contract=GoalContract(verification="PR CI goes green"),
            background_processes=bg,
        )
        assert verdict == "done"
        assert wait_directive is None
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ What you'll see:
|
|||
| Command | What it does |
|
||||
|---|---|
|
||||
| `/goal <text>` | Set (or replace) the standing goal. Kicks off the first turn immediately so you don't need to send a separate message. |
|
||||
| `/goal draft <text>` | Draft a structured completion contract from a plain-language objective, then set it. See [Completion contracts](#completion-contracts). |
|
||||
| `/goal show` | Print the active goal's completion contract. |
|
||||
| `/goal` or `/goal status` | Show the current goal, its status, and turns used. |
|
||||
| `/goal pause` | Stop the auto-continuation loop without clearing the goal. |
|
||||
| `/goal resume` | Resume the loop (resets the turn counter back to zero). |
|
||||
|
|
@ -49,6 +51,46 @@ What you'll see:
|
|||
|
||||
Works identically on the CLI and every gateway platform (Telegram, Discord, Slack, Matrix, Signal, WhatsApp, SMS, iMessage, Webhook, API server, and the web dashboard).
|
||||
|
||||
## Completion contracts
|
||||
|
||||
A bare `/goal <text>` works fine, but a *vague* goal makes for vague judging — the judge can only check what you told it to want. Codex's `/goal` guidance makes the same point: a durable objective works best when it names **what done means, how to prove it, what not to break, what's in scope, and when to stop**. Hermes adapts this as an optional **completion contract** layered on top of the existing goal loop.
|
||||
|
||||
A contract has five fields, all optional:
|
||||
|
||||
| Field | Meaning |
|
||||
|---|---|
|
||||
| `outcome` | The single end state that must be true when done. |
|
||||
| `verification` | The specific test / command / artifact that *proves* the outcome. |
|
||||
| `constraints` | What must not change or regress. |
|
||||
| `boundaries` | Which files, dirs, tools, or systems are in scope. |
|
||||
| `stop_when` | The condition under which Hermes should stop and ask for input. |
|
||||
|
||||
When a contract is set, both prompts change: the **continuation prompt** tells the agent to target the verification surface and respect the constraints, and the **judge prompt** decides `done` *only when the verification criterion is met with concrete evidence* (a command result, file excerpt, test output) — not a loose "looks done" claim. This directly tightens the most common `/goal` failure mode (premature completion or endless over-continuation on an underspecified objective).
|
||||
|
||||
### Two ways to set a contract
|
||||
|
||||
**1. Let Hermes draft it** (recommended — adapted from Codex's "let the agent draft the goal" tip):
|
||||
|
||||
```
|
||||
/goal draft Migrate the auth service from session cookies to JWT
|
||||
```
|
||||
|
||||
Hermes expands your one-liner into a full contract via the `goal_judge` auxiliary model, sets it, and shows you the result so you can review or tighten any field. If the aux model is unavailable, it falls back to a plain free-form goal — drafting never blocks setting a goal.
|
||||
|
||||
**2. Write it inline** with `field: value` lines:
|
||||
|
||||
```
|
||||
/goal Migrate auth to JWT
|
||||
verify: pytest tests/auth passes
|
||||
constraints: keep the /login response shape unchanged
|
||||
boundaries: only touch services/auth and its tests
|
||||
stop when: a DB schema migration is required
|
||||
```
|
||||
|
||||
The first non-field line(s) are the goal headline; recognized field prefixes (`verify:`, `verified by:`, `constraints:`, `preserve:`, `boundaries:`, `scope:`, `stop when:`, `blocked:`, …) populate the contract. A plain goal with an incidental colon (`Fix bug: the parser drops commas`) is **not** mangled — only known field prefixes are pulled out.
|
||||
|
||||
Use `/goal show` to review the active contract. Contracts persist in `SessionDB.state_meta` alongside the goal, so they survive `/resume`. Old goals from before this feature load unchanged (no contract). Contracts and `/subgoal` criteria compose: subgoals fold into the contract as extra criteria the judge must also satisfy.
|
||||
|
||||
## Adding criteria mid-goal: `/subgoal`
|
||||
|
||||
While a goal is active you can append extra acceptance criteria with `/subgoal <text>` without resetting the loop. Each call adds one numbered item to the goal's subgoal list; the **continuation prompt** the agent sees on the next turn includes the original goal plus an "Additional criteria the user added mid-loop" block, and the **judge prompt** is rewritten so the verdict must consider every subgoal — the goal isn't marked done until the original objective **and** every subgoal are met.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue