mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
* feat(goals): /subgoal — user-added criteria appended to active /goal Layers a /subgoal command on top of the existing freeform Ralph judge loop. The user can append extra criteria mid-loop; the judge factors them into its done/continue verdict and the continuation prompt surfaces them to the agent. No new tool, no agent self-judging — the existing judge model just sees a richer prompt. Forms: /subgoal show current subgoals /subgoal <text> append a criterion /subgoal remove <n> drop subgoal n (1-based) /subgoal clear wipe all subgoals How it integrates: - GoalState gains `subgoals: List[str]` (default []), backwards-compat for existing state_meta rows. - judge_goal accepts an optional subgoals kwarg; non-empty switches to JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE which lists them as numbered criteria and asks 'is the goal AND every additional criterion satisfied?' - next_continuation_prompt picks CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE when non-empty so the agent sees what to target. - /subgoal is allowed mid-run on the gateway since it only touches the state the judge reads at turn boundary — no race with the running turn. - Status line shows '... , N subgoals' when present. Surface: - hermes_cli/goals.py — field, prompt blocks, manager methods, judge weave - hermes_cli/commands.py — /subgoal CommandDef - cli.py — _handle_subgoal_command - gateway/run.py — _handle_subgoal_command + mid-run dispatch - tests/hermes_cli/test_goals.py — 15 new tests (backcompat, mutation, persistence, prompt template selection, judge-prompt content via mock, status-line rendering) 77 goal-related tests passing across goals + cli + gateway + tui. * fix(goals): slash commands don't preempt the goal-continuation hook Two findings from live-testing /subgoal: 1. Slash commands queued while the agent is running landed in _pending_input (same queue as real user messages). The goal hook's 'is a real user message pending?' 
check returned True and silently skipped — but the slash command consumes its queue slot via process_command() which never re-fires the goal hook, so the loop stalls indefinitely. Now the hook peeks the queue and only defers when a non-slash payload is present. 2. The with-subgoals judge prompt was too soft — opus 4.7 said 'done, implying all requirements met' without verifying. Tightened to demand specific per-criterion evidence (file contents, output line, command result) and explicitly reject phrases like 'implying it was done.' Live verified: /subgoal injected mid-loop now correctly forces the judge to refuse done until the new criterion is met. Agent gets the continuation prompt with subgoals listed, updates the script, judge confirms done with specific evidence cited.
722 lines
29 KiB
Python
722 lines
29 KiB
Python
"""Persistent session goals — the Ralph loop for Hermes.
|
|
|
|
A goal is a free-form user objective that stays active across turns. After
each turn completes, a small judge call asks an auxiliary model "is this
goal satisfied by the assistant's last response?". If not, Hermes feeds a
continuation prompt back into the same session and keeps working until the
goal is done, the turn budget is exhausted, the user pauses/clears it, or
the user sends a new message (which takes priority and pauses the goal loop).

State is persisted in SessionDB's ``state_meta`` table keyed by
``goal:<session_id>`` so ``/resume`` picks it up.

Design notes / invariants:

- The continuation prompt is just a normal user message appended to the
  session via ``run_conversation``. No system-prompt mutation, no toolset
  swap — prompt caching stays intact.
- Judge failures are fail-OPEN: ``continue``. A broken judge must not wedge
  progress; the turn budget is the backstop.
- When a real user message arrives mid-loop it preempts the continuation
  prompt and also pauses the goal loop for that turn (we still re-judge
  after, so if the user's message happens to complete the goal the judge
  will say ``done``).
- This module has zero hard dependency on ``cli.HermesCLI`` or the gateway
  runner — both wire the same ``GoalManager`` in.

Nothing in this module touches the agent's system prompt or toolset.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import time
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)


# ──────────────────────────────────────────────────────────────────────
# Constants & defaults
# ──────────────────────────────────────────────────────────────────────

# Turn budget given to a new goal when the caller does not supply one.
DEFAULT_MAX_TURNS = 20

# Default ``timeout`` value handed to the judge's chat-completion call.
DEFAULT_JUDGE_TIMEOUT = 30.0

# Upper bound on how many characters of the agent's last response are
# forwarded to the judge.
_JUDGE_RESPONSE_SNIPPET_CHARS = 4_000

# Number of consecutive judge *parse* failures (empty output / non-JSON)
# after which the loop auto-pauses and points the user at the goal_judge
# config. API / transport errors do NOT count toward this — those are
# transient. The guard exists for small models (e.g. deepseek-v4-flash) that
# cannot follow the strict JSON reply contract; without it the loop would run
# until the turn budget is exhausted, with every reply shaped like
# `judge returned empty response` or `judge reply was not JSON`.
DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
|
|
|
|
|
|
# Plain continuation nudge. ``{goal}`` is the only placeholder.
CONTINUATION_PROMPT_TEMPLATE = (
    "[Continuing toward your standing goal]\nGoal: {goal}\n\n"
    "Continue working toward this goal. "
    "Take the next concrete step. "
    "If you believe the goal is complete, state so explicitly and stop. "
    "If you are blocked and need input from the user, "
    "say so clearly and stop."
)

# Variant chosen once the user has added one or more /subgoal criteria.
# The criteria are shown to the agent verbatim via ``{subgoals_block}`` so
# it knows what to target on the next turn; the judge is shown the same
# criteria through its own template.
CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE = (
    "[Continuing toward your standing goal]\nGoal: {goal}\n\n"
    "Additional criteria the user added mid-loop:\n{subgoals_block}\n\n"
    "Continue working toward the goal AND all additional criteria. "
    "Take the next concrete step. "
    "If you believe the goal and every additional criterion are complete, "
    "state so explicitly and stop. "
    "If you are blocked and need input from the user, "
    "say so clearly and stop."
)
|
|
|
|
|
|
# System message for the judge call. It is sent as-is (never passed through
# ``str.format``), so the literal braces in the JSON example are safe.
JUDGE_SYSTEM_PROMPT = (
    "You are a strict judge evaluating whether an autonomous agent has "
    "achieved a user's stated goal. "
    "You receive the goal text and the agent's most recent response. "
    "Your only job is to decide whether the goal is fully satisfied "
    "based on that response.\n\n"
    "A goal is DONE only when:\n"
    "- The response explicitly confirms the goal was completed, OR\n"
    "- The response clearly shows the final deliverable was produced, OR\n"
    "- The response explains the goal is unachievable / blocked / needs "
    "user input (treat this as DONE with reason describing the block).\n\n"
    "Otherwise the goal is NOT done — CONTINUE.\n\n"
    "Reply ONLY with a single JSON object on one line:\n"
    '{"done": <true|false>, "reason": "<one-sentence rationale>"}'
)
|
|
|
|
|
|
# User message for the judge when the goal has no extra criteria.
# Placeholders: ``{goal}`` and ``{response}``.
JUDGE_USER_PROMPT_TEMPLATE = (
    "Goal:\n{goal}\n\nAgent's most recent response:\n{response}\n\nIs the goal satisfied?"
)

# User message for the judge once /subgoal criteria exist. The judge has to
# confirm ALL of them, not just the original goal.
# Placeholders: ``{goal}``, ``{subgoals_block}`` and ``{response}``.
JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE = (
    "Goal:\n{goal}\n\n"
    "Additional criteria the user added mid-loop "
    "(all must also be satisfied for the goal to be DONE):\n"
    "{subgoals_block}\n\n"
    "Agent's most recent response:\n{response}\n\n"
    "Decision: For each numbered criterion above, find concrete evidence "
    "in the agent's response that the criterion is satisfied. "
    "Do not accept generic phrases like 'all requirements met' or "
    "'implying it was done' — require specific evidence "
    "(a file contents excerpt, an output line, a command result). "
    "If ANY criterion lacks specific evidence in the response, "
    "the goal is NOT done — return CONTINUE.\n\n"
    "Is the goal AND every additional criterion satisfied?"
)
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Dataclass
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class GoalState:
    """Serializable goal state stored per session.

    Instances round-trip through :meth:`to_json` / :meth:`from_json`.
    """

    goal: str
    status: str = "active"  # active | paused | done | cleared
    turns_used: int = 0
    max_turns: int = DEFAULT_MAX_TURNS
    created_at: float = 0.0
    last_turn_at: float = 0.0
    last_verdict: Optional[str] = None  # "done" | "continue" | "skipped"
    last_reason: Optional[str] = None
    paused_reason: Optional[str] = None  # why we auto-paused (budget, etc.)
    consecutive_parse_failures: int = 0  # judge-output parse failures in a row
    # User-added criteria appended mid-loop via the /subgoal command.
    # When non-empty the judge prompt and continuation prompt both include
    # them so the agent works toward them and the judge factors them into
    # the verdict. Backwards-compatible: defaults to empty so old
    # state_meta rows load unchanged.
    subgoals: List[str] = field(default_factory=list)

    def to_json(self) -> str:
        """Serialize every field to a JSON string (non-ASCII kept as-is)."""
        return json.dumps(asdict(self), ensure_ascii=False)

    @classmethod
    def from_json(cls, raw: str) -> "GoalState":
        """Rebuild a state from the JSON produced by :meth:`to_json`.

        Missing or ``null`` fields fall back to the dataclass defaults, so
        rows written before a field existed still load. ``null`` and blank
        subgoal entries are dropped instead of being stringified.

        Raises:
            ValueError: if *raw* is not valid JSON or its root is not an
                object.
        """
        data = json.loads(raw)
        if not isinstance(data, dict):
            # Without this check the ``.get`` calls below would fail with an
            # opaque AttributeError on list / scalar roots.
            raise ValueError("stored goal state is not a JSON object")

        subgoals: List[str] = []
        raw_subgoals = data.get("subgoals")
        if isinstance(raw_subgoals, list):
            for item in raw_subgoals:
                if item is None:
                    # str(None) would otherwise become the criterion "None".
                    continue
                text = str(item).strip()
                if text:
                    subgoals.append(text)

        return cls(
            # ``or`` fallbacks mirror the numeric fields: an explicit null
            # must not leak through as None into code that calls .strip().
            goal=str(data.get("goal") or ""),
            status=str(data.get("status") or "active"),
            turns_used=int(data.get("turns_used", 0) or 0),
            max_turns=int(data.get("max_turns", DEFAULT_MAX_TURNS) or DEFAULT_MAX_TURNS),
            created_at=float(data.get("created_at", 0.0) or 0.0),
            last_turn_at=float(data.get("last_turn_at", 0.0) or 0.0),
            last_verdict=data.get("last_verdict"),
            last_reason=data.get("last_reason"),
            paused_reason=data.get("paused_reason"),
            consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
            subgoals=subgoals,
        )

    # --- subgoals helpers -------------------------------------------------

    def render_subgoals_block(self) -> str:
        """Render the subgoals as a numbered ``- N. text`` block.

        Returns an empty string when no subgoals exist.
        """
        if not self.subgoals:
            return ""
        return "\n".join(f"- {i}. {text}" for i, text in enumerate(self.subgoals, start=1))
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Persistence (SessionDB state_meta)
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _meta_key(session_id: str) -> str:
    """Return the ``state_meta`` key under which a session's goal is stored."""
    return "goal:{}".format(session_id)
|
|
|
|
|
|
# One SessionDB instance per hermes_home path, filled lazily by
# ``_get_session_db``.
_DB_CACHE: Dict[str, Any] = {}


def _get_session_db() -> Optional[Any]:
    """Return the SessionDB instance for the current HERMES_HOME, or None.

    SessionDB has no built-in singleton, and opening a fresh connection on
    every /goal call would thrash the file, so one instance is cached per
    ``hermes_home`` path; a profile switch therefore still resolves to the
    right DB. Import and construction failures are logged at debug level and
    reported as ``None`` so tests and non-standard launchers can still use
    the GoalManager.
    """
    try:
        from hermes_constants import get_hermes_home
        from hermes_state import SessionDB

        home = str(get_hermes_home())
    except Exception as exc:  # pragma: no cover
        logger.debug("GoalManager: SessionDB bootstrap failed (%s)", exc)
        return None

    existing = _DB_CACHE.get(home)
    if existing is not None:
        return existing

    try:
        fresh = SessionDB()
    except Exception as exc:  # pragma: no cover
        logger.debug("GoalManager: SessionDB() raised (%s)", exc)
        return None
    else:
        _DB_CACHE[home] = fresh
        return fresh
|
|
|
|
|
|
def load_goal(session_id: str) -> Optional[GoalState]:
    """Fetch and decode the stored goal for *session_id*.

    Returns ``None`` when the id is empty, the DB is unavailable, the read
    fails, nothing is stored, or the stored payload cannot be parsed (the
    last case is logged as a warning).
    """
    if not session_id:
        return None

    store = _get_session_db()
    if store is None:
        return None

    try:
        payload = store.get_meta(_meta_key(session_id))
    except Exception as exc:
        logger.debug("GoalManager: get_meta failed: %s", exc)
        return None

    if payload:
        try:
            return GoalState.from_json(payload)
        except Exception as exc:
            logger.warning("GoalManager: could not parse stored goal for %s: %s", session_id, exc)
    return None
|
|
|
|
|
|
def save_goal(session_id: str, state: GoalState) -> None:
    """Write *state* to SessionDB under the session's goal key.

    Silently does nothing when the id is empty or the DB is unavailable; a
    failing write is logged at debug level and swallowed.
    """
    store = _get_session_db() if session_id else None
    if store is None:
        return
    try:
        store.set_meta(_meta_key(session_id), state.to_json())
    except Exception as exc:
        logger.debug("GoalManager: set_meta failed: %s", exc)
|
|
|
|
|
|
def clear_goal(session_id: str) -> None:
    """Flip a stored goal to ``status="cleared"``.

    The row is kept (for audit) rather than deleted. Nothing happens when no
    goal can be loaded for the session.
    """
    stored = load_goal(session_id)
    if stored is not None:
        stored.status = "cleared"
        save_goal(session_id, stored)
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Judge
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _truncate(text: str, limit: int) -> str:
    """Clip *text* to *limit* characters, marking the cut.

    Falsy input yields ``""``. Text that already fits is returned unchanged;
    otherwise the first *limit* characters are kept and ``"… [truncated]"``
    is appended (so the result is longer than *limit*).
    """
    if not text:
        return ""
    fits = len(text) <= limit
    if fits:
        return text
    head = text[:limit]
    return head + "… [truncated]"
|
|
|
|
|
|
# Legacy pattern for the shortest ``{...}`` span. Retained so existing
# importers keep working, but no longer used by the parser: a non-greedy
# match stops at the first ``}``, which breaks on nested objects and on
# string values that contain a brace.
_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)


def _first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object embedded anywhere in *text*, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one document and ignores trailing prose.
            candidate, _end = decoder.raw_decode(text[start:])
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This "{" did not open a valid object — try the next one.
        start = text.find("{", start + 1)
    return None


def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
    """Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.

    Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
    judge returned output that couldn't be interpreted as the expected JSON
    verdict (empty body, prose, malformed JSON). Callers use that flag to
    auto-pause after N consecutive parse failures so a weak judge model
    doesn't silently burn the turn budget.

    The reply may be bare JSON, JSON inside a markdown code fence, or JSON
    surrounded by prose; in the last case the first decodable JSON object
    wins. A string ``done`` value counts as true when it is one of
    ``true`` / ``yes`` / ``1`` / ``done`` (case-insensitive); any other type
    is coerced with ``bool()``. A missing or blank ``reason`` becomes
    ``"no reason provided"``.
    """
    if not raw:
        return False, "judge returned empty response", True

    text = raw.strip()

    # Strip markdown code fences the model may wrap JSON in.
    if text.startswith("```"):
        text = text.strip("`")
        first_line, newline, remainder = text.partition("\n")
        # Peel off a leading json/JSON/etc tag line — but only when that line
        # is not itself the start of the JSON payload.
        if newline and "{" not in first_line:
            text = remainder

    # First try: parse the whole blob. Second try: scan for an embedded object.
    data: Any = None
    try:
        data = json.loads(text)
    except (ValueError, RecursionError):
        data = _first_json_object(text)

    if not isinstance(data, dict):
        return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True

    done_val = data.get("done")
    if isinstance(done_val, str):
        done = done_val.strip().lower() in {"true", "yes", "1", "done"}
    else:
        done = bool(done_val)

    reason = str(data.get("reason") or "").strip() or "no reason provided"
    return done, reason, False
|
|
|
|
|
|
def judge_goal(
    goal: str,
    last_response: str,
    *,
    timeout: float = DEFAULT_JUDGE_TIMEOUT,
    subgoals: Optional[List[str]] = None,
) -> Tuple[str, str, bool]:
    """Ask the auxiliary model whether the goal is satisfied.

    Args:
        goal: The standing goal text.
        last_response: The agent's most recent response.
        timeout: Forwarded as ``timeout`` to the chat-completion call.
        subgoals: Optional user-added criteria (from ``/subgoal``). When any
            non-blank entry is present the with-subgoals prompt template is
            used and the criteria are listed as ``- N. text`` lines;
            otherwise the plain template is used.

    Returns:
        ``(verdict, reason, parse_failed)``. ``verdict`` is ``"done"``,
        ``"continue"``, or ``"skipped"`` (returned for a blank goal).
        ``parse_failed`` is True only when the call succeeded but the reply
        was unusable (empty or non-JSON); import, client, and API errors all
        return ``("continue", "...", False)``.

    This is deliberately fail-open so a broken judge doesn't wedge progress —
    the turn budget and the consecutive-parse-failures auto-pause (see
    ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``) are the backstops.
    """
    if not goal.strip():
        return "skipped", "empty goal", False
    if not last_response.strip():
        # Nothing substantive came back this turn, so there is nothing to judge.
        return "continue", "empty response (nothing to evaluate)", False

    try:
        from agent.auxiliary_client import get_auxiliary_extra_body, get_text_auxiliary_client
    except Exception as exc:
        logger.debug("goal judge: auxiliary client import failed: %s", exc)
        return "continue", "auxiliary client unavailable", False

    try:
        client, model = get_text_auxiliary_client("goal_judge")
    except Exception as exc:
        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
        return "continue", "auxiliary client unavailable", False

    if client is None or not model:
        return "continue", "no auxiliary client configured", False

    # Assemble the user prompt; both templates share the goal/response parts.
    goal_text = _truncate(goal, 2000)
    response_text = _truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS)
    criteria = [item.strip() for item in (subgoals or []) if item and item.strip()]
    if not criteria:
        prompt = JUDGE_USER_PROMPT_TEMPLATE.format(goal=goal_text, response=response_text)
    else:
        numbered = [f"- {n}. {criterion}" for n, criterion in enumerate(criteria, start=1)]
        prompt = JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE.format(
            goal=goal_text,
            subgoals_block=_truncate("\n".join(numbered), 2000),
            response=response_text,
        )

    conversation = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=0,
            max_tokens=200,
            timeout=timeout,
            extra_body=get_auxiliary_extra_body() or None,
        )
    except Exception as exc:
        logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
        return "continue", f"judge error: {type(exc).__name__}", False

    # An unexpected response shape is treated as an empty reply, which the
    # parser reports as a parse failure.
    try:
        reply = completion.choices[0].message.content or ""
    except Exception:
        reply = ""

    done, reason, parse_failed = _parse_judge_response(reply)
    verdict = "done" if done else "continue"
    logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
    return verdict, reason, parse_failed
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# GoalManager — the orchestration surface CLI + gateway talk to
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class GoalManager:
    """Per-session goal state + continuation decisions.

    The CLI and gateway each hold one ``GoalManager`` per live session.

    Methods:

    - ``set(goal)`` — start a new standing goal.
    - ``clear()`` — remove the active goal.
    - ``pause()`` / ``resume()`` — explicit user controls.
    - ``status_line()`` — printable one-liner.
    - ``add_subgoal()`` / ``remove_subgoal()`` / ``clear_subgoals()`` /
      ``render_subgoals()`` — the ``/subgoal`` controls.
    - ``evaluate_after_turn(last_response)`` — call the judge, update state,
      and return a decision dict the caller uses to drive the next turn.
    - ``next_continuation_prompt()`` — the canonical user-role message to
      feed back into ``run_conversation``.
    """

    def __init__(self, session_id: str, *, default_max_turns: int = DEFAULT_MAX_TURNS):
        """Bind to *session_id* and load any goal already stored for it.

        A falsy ``default_max_turns`` falls back to ``DEFAULT_MAX_TURNS``.
        """
        self.session_id = session_id
        self.default_max_turns = int(default_max_turns or DEFAULT_MAX_TURNS)
        self._state: Optional[GoalState] = load_goal(session_id)

    # --- introspection ------------------------------------------------

    @property
    def state(self) -> Optional[GoalState]:
        """The current ``GoalState``, or ``None`` when there is none."""
        return self._state

    def is_active(self) -> bool:
        """True when a goal exists and its status is ``"active"``."""
        current = self._state
        return current is not None and current.status == "active"

    def has_goal(self) -> bool:
        """True when a goal exists and is either active or paused."""
        current = self._state
        return current is not None and current.status in {"active", "paused"}

    def status_line(self) -> str:
        """Return a one-line, user-facing summary of the goal."""
        state = self._state
        if state is None or state.status in {"cleared"}:
            return "No active goal. Set one with /goal <text>."

        budget = f"{state.turns_used}/{state.max_turns} turns"
        criteria = ""
        if state.subgoals:
            count = len(state.subgoals)
            plural = "" if count == 1 else "s"
            criteria = f", {count} subgoal{plural}"

        if state.status == "active":
            return f"⊙ Goal (active, {budget}{criteria}): {state.goal}"
        if state.status == "paused":
            why = f" — {state.paused_reason}" if state.paused_reason else ""
            return f"⏸ Goal (paused, {budget}{criteria}{why}): {state.goal}"
        if state.status == "done":
            return f"✓ Goal done ({budget}{criteria}): {state.goal}"
        return f"Goal ({state.status}, {budget}{criteria}): {state.goal}"

    # --- mutation -----------------------------------------------------

    def set(self, goal: str, *, max_turns: Optional[int] = None) -> GoalState:
        """Replace any existing goal with a fresh active one and persist it.

        Raises:
            ValueError: if *goal* is empty after stripping whitespace.
        """
        cleaned = (goal or "").strip()
        if not cleaned:
            raise ValueError("goal text is empty")
        budget = int(max_turns) if max_turns else self.default_max_turns
        fresh = GoalState(
            goal=cleaned,
            status="active",
            turns_used=0,
            max_turns=budget,
            created_at=time.time(),
            last_turn_at=0.0,
        )
        self._state = fresh
        save_goal(self.session_id, fresh)
        return fresh

    def pause(self, reason: str = "user-paused") -> Optional[GoalState]:
        """Mark the goal paused with *reason*; returns None if there is no goal."""
        state = self._state
        if not state:
            return None
        state.status = "paused"
        state.paused_reason = reason
        save_goal(self.session_id, state)
        return state

    def resume(self, *, reset_budget: bool = True) -> Optional[GoalState]:
        """Reactivate the goal, by default also zeroing ``turns_used``.

        Returns None if there is no goal.
        """
        state = self._state
        if not state:
            return None
        state.status = "active"
        state.paused_reason = None
        if reset_budget:
            state.turns_used = 0
        save_goal(self.session_id, state)
        return state

    def clear(self) -> None:
        """Persist the goal as ``cleared`` and drop it from this manager."""
        state = self._state
        if state is None:
            return
        state.status = "cleared"
        save_goal(self.session_id, state)
        self._state = None

    def mark_done(self, reason: str) -> None:
        """Record the goal as done with *reason* as the last verdict reason."""
        state = self._state
        if not state:
            return
        state.status = "done"
        state.last_verdict = "done"
        state.last_reason = reason
        save_goal(self.session_id, state)

    # --- /subgoal user controls ---------------------------------------

    def _require_goal(self) -> GoalState:
        """Return the current state, or raise ``RuntimeError`` without a goal."""
        state = self._state
        if state is None or not self.has_goal():
            raise RuntimeError("no active goal")
        return state

    def add_subgoal(self, text: str) -> str:
        """Append a user-added criterion to the current goal.

        Returns the cleaned text so the caller can show it back to the user.

        Raises:
            RuntimeError: if ``has_goal()`` is false.
            ValueError: if *text* is empty after stripping whitespace.
        """
        state = self._require_goal()
        cleaned = (text or "").strip()
        if not cleaned:
            raise ValueError("subgoal text is empty")
        state.subgoals.append(cleaned)
        save_goal(self.session_id, state)
        return cleaned

    def remove_subgoal(self, index_1based: int) -> str:
        """Remove a subgoal by 1-based index and return the removed text.

        Raises:
            RuntimeError: if ``has_goal()`` is false.
            IndexError: if the index is outside ``1..len(subgoals)``.
        """
        state = self._require_goal()
        position = int(index_1based) - 1
        if not 0 <= position < len(state.subgoals):
            raise IndexError(
                f"index out of range (1..{len(state.subgoals)})"
            )
        removed = state.subgoals.pop(position)
        save_goal(self.session_id, state)
        return removed

    def clear_subgoals(self) -> int:
        """Wipe all subgoals and return how many there were.

        Raises:
            RuntimeError: if ``has_goal()`` is false.
        """
        state = self._require_goal()
        previous = len(state.subgoals)
        state.subgoals = []
        save_goal(self.session_id, state)
        return previous

    def render_subgoals(self) -> str:
        """Public helper for the /subgoal slash command.

        Returns the numbered subgoal block, or a placeholder line when there
        is no state or no subgoals.
        """
        state = self._state
        if state is None:
            return "(no active goal)"
        if not state.subgoals:
            return "(no subgoals — use /subgoal <text> to add criteria)"
        return state.render_subgoals_block()

    # --- the main entry point called after every turn -----------------

    @staticmethod
    def _decision(
        status: Optional[str],
        verdict: str,
        reason: str,
        message: str,
        *,
        should_continue: bool = False,
        continuation_prompt: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Build the decision dict returned by ``evaluate_after_turn``."""
        return {
            "status": status,
            "should_continue": should_continue,
            "continuation_prompt": continuation_prompt,
            "verdict": verdict,
            "reason": reason,
            "message": message,
        }

    def evaluate_after_turn(
        self,
        last_response: str,
        *,
        user_initiated: bool = True,
    ) -> Dict[str, Any]:
        """Run the judge and update state. Return a decision dict.

        ``user_initiated`` distinguishes a real user prompt (True) from a
        continuation prompt we fed ourselves (False). The flag is not
        consulted here: both kinds of turn increment ``turns_used`` because
        both consume model budget.

        Decision keys:

        - ``status``: current goal status after update
        - ``should_continue``: bool — caller should fire another turn
        - ``continuation_prompt``: str or None
        - ``verdict``: "done" | "continue" | "skipped" | "inactive"
        - ``reason``: str
        - ``message``: user-visible one-liner to print/send
        """
        state = self._state
        if state is None or state.status != "active":
            return self._decision(
                state.status if state else None,
                "inactive",
                "no active goal",
                "",
            )

        # Count the turn that just finished.
        state.turns_used += 1
        state.last_turn_at = time.time()

        verdict, reason, parse_failed = judge_goal(
            state.goal, last_response, subgoals=state.subgoals or None
        )
        state.last_verdict = verdict
        state.last_reason = reason

        # Track consecutive judge parse failures. Any reply with
        # parse_failed=False resets the streak — that includes API /
        # transport errors, so a flaky network doesn't trip the auto-pause
        # meant for bad judge models.
        if parse_failed:
            state.consecutive_parse_failures += 1
        else:
            state.consecutive_parse_failures = 0

        if verdict == "done":
            state.status = "done"
            save_goal(self.session_id, state)
            return self._decision("done", "done", reason, f"✓ Goal achieved: {reason}")

        # Auto-pause when the judge model can't produce the expected JSON
        # verdict N turns in a row, and point the user at the goal_judge
        # config so they can route this side task to a model that follows
        # the contract. Without this guard, weak judge models burn the
        # entire turn budget returning prose or empty strings.
        failures = state.consecutive_parse_failures
        if failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
            state.status = "paused"
            state.paused_reason = (
                f"judge model returned unparseable output {failures} turns in a row"
            )
            save_goal(self.session_id, state)
            # The YAML hint is indented so the keys nest the way they must
            # appear in the config file.
            hint = (
                f"⏸ Goal paused — the judge model ({failures} turns) "
                "isn't returning the required JSON verdict. Route the judge to a stricter "
                "model in ~/.hermes/config.yaml:\n"
                "  auxiliary:\n"
                "    goal_judge:\n"
                "      provider: openrouter\n"
                "      model: google/gemini-3-flash-preview\n"
                "Then /goal resume to continue."
            )
            return self._decision("paused", "continue", reason, hint)

        if state.turns_used >= state.max_turns:
            state.status = "paused"
            state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
            save_goal(self.session_id, state)
            return self._decision(
                "paused",
                "continue",
                reason,
                (
                    f"⏸ Goal paused — {state.turns_used}/{state.max_turns} turns used. "
                    "Use /goal resume to keep going, or /goal clear to stop."
                ),
            )

        save_goal(self.session_id, state)
        return self._decision(
            "active",
            "continue",
            reason,
            f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}",
            should_continue=True,
            continuation_prompt=self.next_continuation_prompt(),
        )

    def next_continuation_prompt(self) -> Optional[str]:
        """Return the prompt to feed back for the next turn.

        ``None`` unless the goal is active. The with-subgoals template is
        used when the goal has any subgoals.
        """
        state = self._state
        if not state or state.status != "active":
            return None
        if not state.subgoals:
            return CONTINUATION_PROMPT_TEMPLATE.format(goal=state.goal)
        return CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE.format(
            goal=state.goal,
            subgoals_block=state.render_subgoals_block(),
        )
|
|
|
|
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    # state + orchestration
    "GoalState",
    "GoalManager",
    # prompt templates
    "CONTINUATION_PROMPT_TEMPLATE",
    "CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE",
    "JUDGE_USER_PROMPT_TEMPLATE",
    "JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE",
    # defaults
    "DEFAULT_MAX_TURNS",
    # persistence helpers
    "load_goal",
    "save_goal",
    "clear_goal",
    # judge entry point
    "judge_goal",
]
|