diff --git a/cli.py b/cli.py index dbbf83f2c0..9ff6b8708a 100644 --- a/cli.py +++ b/cli.py @@ -6540,6 +6540,8 @@ class HermesCLI: # No active run — treat as a normal next-turn message. self._pending_input.put(payload) _cprint(f" No agent running; queued as next turn: {payload[:80]}{'...' if len(payload) > 80 else ''}") + elif canonical == "goal": + self._handle_goal_command(cmd_original) elif canonical == "skin": self._handle_skin_command(cmd_original) elif canonical == "voice": @@ -7020,6 +7022,166 @@ class HermesCLI: print(" status Show current browser mode") print() + # ──────────────────────────────────────────────────────────────── + # /goal — persistent cross-turn goals (Ralph-style loop) + # ──────────────────────────────────────────────────────────────── + def _get_goal_manager(self): + """Return the GoalManager bound to the current session_id. + + Cached on ``self._goal_manager`` and rebound lazily when + ``session_id`` changes (e.g. after /new or a compression-driven + session split). 
+ """ + try: + from hermes_cli.goals import GoalManager + from hermes_cli.config import load_config + except Exception as exc: + logging.debug("goal manager unavailable: %s", exc) + return None + + sid = getattr(self, "session_id", None) or "" + if not sid: + return None + + existing = getattr(self, "_goal_manager", None) + if existing is not None and getattr(existing, "session_id", None) == sid: + return existing + + try: + cfg = load_config() or {} + goals_cfg = cfg.get("goals") or {} + max_turns = int(goals_cfg.get("max_turns", 20) or 20) + except Exception: + max_turns = 20 + + mgr = GoalManager(session_id=sid, default_max_turns=max_turns) + self._goal_manager = mgr + return mgr + + def _handle_goal_command(self, cmd: str) -> None: + """Dispatch /goal subcommands: set / status / pause / resume / clear.""" + parts = (cmd or "").strip().split(None, 1) + arg = parts[1].strip() if len(parts) > 1 else "" + + mgr = self._get_goal_manager() + if mgr is None: + _cprint(f" {_DIM}Goals unavailable (no active session).{_RST}") + return + + lower = arg.lower() + + # Bare /goal or /goal status → show current state + if not arg or lower == "status": + _cprint(f" {mgr.status_line()}") + return + + if lower == "pause": + state = mgr.pause(reason="user-paused") + if state is None: + _cprint(f" {_DIM}No goal set.{_RST}") + else: + _cprint(f" ⏸ Goal paused: {state.goal}") + return + + if lower == "resume": + state = mgr.resume() + if state is None: + _cprint(f" {_DIM}No goal to resume.{_RST}") + else: + _cprint(f" ▶ Goal resumed: {state.goal}") + _cprint( + f" {_DIM}Send any message (or press Enter on an empty prompt " + f"is a no-op; type 'continue' to kick it off).{_RST}" + ) + return + + if lower in ("clear", "stop", "done"): + had = mgr.has_goal() + mgr.clear() + if had: + _cprint(" ✓ Goal cleared.") + else: + _cprint(f" {_DIM}No active goal.{_RST}") + return + + # Otherwise treat the arg as the goal text. 
+ try: + state = mgr.set(arg) + except ValueError as exc: + _cprint(f" Invalid goal: {exc}") + return + + _cprint(f" ⊙ Goal set ({state.max_turns}-turn budget): {state.goal}") + _cprint( + f" {_DIM}After each turn, a judge model will check if the goal is done. " + f"Hermes keeps working until it is, you pause/clear it, or the budget is " + f"exhausted. Use /goal status, /goal pause, /goal resume, /goal clear.{_RST}" + ) + # Kick the loop off immediately so the user doesn't have to send a + # separate message after setting the goal. + try: + self._pending_input.put(state.goal) + except Exception: + pass + + def _maybe_continue_goal_after_turn(self) -> None: + """Hook run after every CLI turn. Judges + maybe re-queues. + + Safe to call when no goal is set — returns quickly. + + Preemption is automatic: if a real user message is already in + ``_pending_input`` we skip judging (the user's new input takes + priority and we'll re-judge after that turn). If judge says done, + mark it done and tell the user. If judge says continue and we're + under budget, push the continuation prompt onto the queue. + """ + mgr = self._get_goal_manager() + if mgr is None or not mgr.is_active(): + return + + # If a real user message is already queued, don't inject a + # continuation prompt on top — let the user's turn go first. + try: + if getattr(self, "_pending_input", None) is not None \ + and not self._pending_input.empty(): + return + except Exception: + pass + + # Extract the agent's final response for this turn. + last_response = "" + try: + hist = self.conversation_history or [] + for msg in reversed(hist): + if msg.get("role") == "assistant": + content = msg.get("content", "") + if isinstance(content, list): + # Multimodal content — flatten text parts. 
+ parts = [ + p.get("text", "") + for p in content + if isinstance(p, dict) and p.get("type") in ("text", "output_text") + ] + last_response = "\n".join(t for t in parts if t) + else: + last_response = str(content or "") + break + except Exception: + last_response = "" + + decision = mgr.evaluate_after_turn(last_response, user_initiated=True) + msg = decision.get("message") or "" + if msg: + _cprint(f" {msg}") + + if decision.get("should_continue"): + prompt = decision.get("continuation_prompt") + if prompt: + try: + self._pending_input.put(prompt) + except Exception as exc: + logging.debug("goal continuation enqueue failed: %s", exc) + def _handle_skin_command(self, cmd: str): """Handle /skin [name] — show or change the display skin.""" try: @@ -11358,6 +11520,17 @@ class HermesCLI: app.invalidate() # Refresh status line + # Goal continuation: if a standing goal is active, ask + # the judge whether the turn satisfied it. If not, and + # there's no real user message already queued, push the + # continuation prompt back into _pending_input so the + # next loop iteration picks it up naturally (and any + # user input that arrives in between still preempts). + try: + self._maybe_continue_goal_after_turn() + except Exception as _goal_exc: + logging.debug("goal continuation hook failed: %s", _goal_exc) + # Continuous voice: auto-restart recording after agent responds. # Dispatch to a daemon thread so play_beep (sd.wait) and # AudioRecorder.start (lock acquire) never block process_loop — diff --git a/gateway/run.py b/gateway/run.py index d991ac4ff8..de04099c3a 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -4595,6 +4595,17 @@ class GatewayRunner: if _cmd_def_inner and _cmd_def_inner.name == "kanban": return await self._handle_kanban_command(event) + # /goal is safe mid-run for status/pause/clear (inspection and + # control-plane only — doesn't interrupt the running turn). 
+ # Setting a new goal text mid-run is rejected with the same + # "wait or /stop" message as /model so we don't race a second + # continuation prompt against the current turn. + if _cmd_def_inner and _cmd_def_inner.name == "goal": + _goal_arg = (event.get_command_args() or "").strip().lower() + if not _goal_arg or _goal_arg in ("status", "pause", "resume", "clear", "stop", "done"): + return await self._handle_goal_command(event) + return "Agent is running — use /goal status / pause / clear mid-run, or /stop before setting a new goal." + # Session-level toggles that are safe to run mid-agent — # /yolo can unblock a pending approval prompt, /verbose cycles # the tool-progress display mode for the ongoing stream. @@ -4911,6 +4922,9 @@ class GatewayRunner: # at the end of this function so the rewritten text is sent # to the agent as a regular user turn. + if canonical == "goal": + return await self._handle_goal_command(event) + if canonical == "voice": return await self._handle_voice_command(event) @@ -5056,7 +5070,36 @@ class GatewayRunner: _run_generation = self._begin_session_run_generation(_quick_key) try: - return await self._handle_message_with_agent(event, source, _quick_key, _run_generation) + _agent_result = await self._handle_message_with_agent(event, source, _quick_key, _run_generation) + # Goal continuation: after the agent returns a final response + # for this turn, check any standing /goal — the judge will + # either mark it done, pause it (budget), or enqueue a + # continuation prompt back through the adapter FIFO so the + # next turn makes more progress. Wrapped in try/except so a + # broken judge never breaks normal message handling. 
+ try: + _final_text = "" + if isinstance(_agent_result, dict): + _final_text = str(_agent_result.get("final_response") or "") + elif isinstance(_agent_result, str): + _final_text = _agent_result + # Skip for empty responses (interrupted / errored) — the + # judge would almost always say "continue" and we'd loop + # on error. Let the user drive the next turn. + if _final_text.strip(): + try: + session_entry = self.session_store.get_or_create_session(source) + except Exception: + session_entry = None + if session_entry is not None: + self._post_turn_goal_continuation( + session_entry=session_entry, + source=source, + final_response=_final_text, + ) + except Exception as _goal_exc: + logger.debug("goal continuation hook failed: %s", _goal_exc) + return _agent_result finally: # If _run_agent replaced the sentinel with a real agent and # then cleaned it up, this is a no-op. If we exited early @@ -7422,6 +7465,201 @@ class GatewayRunner: # Let the normal message handler process it return await self._handle_message(retry_event) + # ──────────────────────────────────────────────────────────────── + # /goal — persistent cross-turn goals (Ralph-style loop) + # ──────────────────────────────────────────────────────────────── + def _get_goal_manager_for_event(self, event: "MessageEvent"): + """Return a GoalManager bound to the session for this gateway event. + + Returns ``(manager, session_entry)`` or ``(None, None)`` if the + goals module can't be loaded. 
+ """ + try: + from hermes_cli.goals import GoalManager + except Exception as exc: + logger.debug("goal manager unavailable: %s", exc) + return None, None + try: + session_entry = self.session_store.get_or_create_session(event.source) + except Exception as exc: + logger.debug("goal manager: session lookup failed: %s", exc) + return None, None + sid = getattr(session_entry, "session_id", None) or "" + if not sid: + return None, None + try: + goals_cfg = ( + (self.config or {}).get("goals", {}) + if isinstance(self.config, dict) + else getattr(self.config, "goals", {}) or {} + ) + max_turns = int(goals_cfg.get("max_turns", 20) or 20) + except Exception: + max_turns = 20 + return GoalManager(session_id=sid, default_max_turns=max_turns), session_entry + + async def _handle_goal_command(self, event: "MessageEvent") -> str: + """Handle /goal for gateway platforms. + + Subcommands: ``/goal`` / ``/goal status`` / ``/goal pause`` / + ``/goal resume`` / ``/goal clear``. Any other text becomes the + new goal. + + Setting a new goal queues the goal text as the next turn so the + agent starts working on it immediately — the post-turn + continuation hook then takes over from there. + """ + args = (event.get_command_args() or "").strip() + lower = args.lower() + + mgr, session_entry = self._get_goal_manager_for_event(event) + if mgr is None: + return "Goals unavailable on this session." + + if not args or lower == "status": + return mgr.status_line() + + if lower == "pause": + state = mgr.pause(reason="user-paused") + if state is None: + return "No goal set." + return f"⏸ Goal paused: {state.goal}" + + if lower == "resume": + state = mgr.resume() + if state is None: + return "No goal to resume." + return ( + f"▶ Goal resumed: {state.goal}\n" + "Send any message to continue, or wait — I'll take the next step on the next turn." + ) + + if lower in ("clear", "stop", "done"): + had = mgr.has_goal() + mgr.clear() + return "✓ Goal cleared." if had else "No active goal." 
+ + # Otherwise — treat the remaining text as the new goal. + try: + state = mgr.set(args) + except ValueError as exc: + return f"Invalid goal: {exc}" + + # Queue the goal text as an immediate first turn so the agent + # starts making progress. The post-turn hook takes over after. + adapter = self.adapters.get(event.source.platform) if event.source else None + _quick_key = self._session_key_for_source(event.source) if event.source else None + if adapter and _quick_key: + try: + kickoff_event = MessageEvent( + text=state.goal, + message_type=MessageType.TEXT, + source=event.source, + message_id=event.message_id, + channel_prompt=event.channel_prompt, + ) + self._enqueue_fifo(_quick_key, kickoff_event, adapter) + except Exception as exc: + logger.debug("goal kickoff enqueue failed: %s", exc) + + return ( + f"⊙ Goal set ({state.max_turns}-turn budget): {state.goal}\n" + "I'll keep working until the goal is done, you pause/clear it, or the budget is exhausted.\n" + "Controls: /goal status · /goal pause · /goal resume · /goal clear" + ) + + def _post_turn_goal_continuation( + self, + *, + session_entry: Any, + source: Any, + final_response: str, + ) -> None: + """Run the goal judge after a gateway turn and, if still active, + enqueue a continuation prompt for the same session. + + Called from ``_handle_message_with_agent`` at turn boundary, AFTER + the response has been delivered. Safe when no goal is set. + + We use the adapter's pending-message / FIFO machinery so any real + user message that arrives simultaneously is handled by the same + queue and takes priority naturally. 
+ """ + try: + from hermes_cli.goals import GoalManager + except Exception as exc: + logger.debug("goal continuation: goals module unavailable: %s", exc) + return + + sid = getattr(session_entry, "session_id", None) or "" + if not sid: + return + + try: + goals_cfg = ( + (self.config or {}).get("goals", {}) + if isinstance(self.config, dict) + else getattr(self.config, "goals", {}) or {} + ) + max_turns = int(goals_cfg.get("max_turns", 20) or 20) + except Exception: + max_turns = 20 + + mgr = GoalManager(session_id=sid, default_max_turns=max_turns) + if not mgr.is_active(): + return + + decision = mgr.evaluate_after_turn(final_response or "", user_initiated=True) + msg = decision.get("message") or "" + + # Send the status line back to the user so they see the judge's + # verdict. Fire-and-forget via the adapter. + if msg and source is not None: + try: + adapter = self.adapters.get(source.platform) + if adapter and hasattr(adapter, "send_message"): + import asyncio as _asyncio + coro = adapter.send_message(source, msg) + if _asyncio.iscoroutine(coro): + try: + loop = _asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(coro) + else: + loop.run_until_complete(coro) + except RuntimeError: + # No event loop in this thread — schedule on the main one. + try: + _asyncio.run_coroutine_threadsafe(coro, self._loop) + except Exception: + pass + except Exception as exc: + logger.debug("goal continuation: status send failed: %s", exc) + + if not decision.get("should_continue"): + return + + prompt = decision.get("continuation_prompt") or "" + if not prompt or source is None: + return + + # Enqueue via the adapter's FIFO so a user message already in + # flight preempts the continuation naturally. 
+ try: + adapter = self.adapters.get(source.platform) + _quick_key = self._session_key_for_source(source) + if adapter and _quick_key: + cont_event = MessageEvent( + text=prompt, + message_type=MessageType.TEXT, + source=source, + message_id=None, + channel_prompt=None, + ) + self._enqueue_fifo(_quick_key, cont_event, adapter) + except Exception as exc: + logger.debug("goal continuation: enqueue failed: %s", exc) + async def _handle_undo_command(self, event: MessageEvent) -> str: """Handle /undo command - remove the last user/assistant exchange.""" source = event.source diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 2acffe331a..ce2d9eaaa2 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -95,6 +95,8 @@ COMMAND_REGISTRY: list[CommandDef] = [ aliases=("q",), args_hint=""), CommandDef("steer", "Inject a message after the next tool call without interrupting", "Session", args_hint=""), + CommandDef("goal", "Set a standing goal Hermes works on across turns until achieved", "Session", + args_hint="[text | pause | resume | clear | status]"), CommandDef("status", "Show session info", "Session"), CommandDef("profile", "Show active profile name and home directory", "Info"), CommandDef("sethome", "Set this chat as the home channel", "Session", diff --git a/hermes_cli/config.py b/hermes_cli/config.py index df1a5943f7..720405935b 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -952,7 +952,23 @@ DEFAULT_CONFIG = { # injected at the start of every API call for few-shot priming. # Never saved to sessions, logs, or trajectories. "prefill_messages_file": "", - + + # Goals — persistent cross-turn goals (Ralph-style loop). + # After every turn, a lightweight judge call asks the auxiliary model + # whether the active /goal is satisfied by the assistant's last + # response. 
If not, Hermes feeds a continuation prompt back into the + # same session and keeps working until the goal is done, the turn + # budget is exhausted, or the user pauses/clears it. Judge failures + # fail OPEN (continue) so a flaky judge never wedges progress — the + # turn budget is the real backstop. + "goals": { + # Max continuation turns before Hermes auto-pauses the goal and + # asks the user to /goal resume. Protects against judge false + # negatives (goal actually done but judge says continue) and + # unbounded model spend on fuzzy / unachievable goals. + "max_turns": 20, + }, + # Skills — external skill directories for sharing skills across tools/agents. # Each path is expanded (~, ${VAR}) and resolved. Read-only — skill creation # always goes to ~/.hermes/skills/. diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py new file mode 100644 index 0000000000..0f0f3abd9c --- /dev/null +++ b/hermes_cli/goals.py @@ -0,0 +1,535 @@ +"""Persistent session goals — the Ralph loop for Hermes. + +A goal is a free-form user objective that stays active across turns. After +each turn completes, a small judge call asks an auxiliary model "is this +goal satisfied by the assistant's last response?". If not, Hermes feeds a +continuation prompt back into the same session and keeps working until the +goal is done, turn budget is exhausted, the user pauses/clears it, or the +user sends a new message (which takes priority and pauses the goal loop). + +State is persisted in SessionDB's ``state_meta`` table keyed by +``goal:`` so ``/resume`` picks it up. + +Design notes / invariants: + +- The continuation prompt is just a normal user message appended to the + session via ``run_conversation``. No system-prompt mutation, no toolset + swap — prompt caching stays intact. +- Judge failures are fail-OPEN: ``continue``. A broken judge must not wedge + progress; the turn budget is the backstop. 
+- When a real user message arrives mid-loop it preempts the continuation + prompt and also pauses the goal loop for that turn (we still re-judge + after, so if the user's message happens to complete the goal the judge + will say ``done``). +- This module has zero hard dependency on ``cli.HermesCLI`` or the gateway + runner — both wire the same ``GoalManager`` in. + +Nothing in this module touches the agent's system prompt or toolset. +""" + +from __future__ import annotations + +import json +import logging +import re +import time +from dataclasses import dataclass, asdict +from typing import Any, Dict, Optional, Tuple + +logger = logging.getLogger(__name__) + + +# ────────────────────────────────────────────────────────────────────── +# Constants & defaults +# ────────────────────────────────────────────────────────────────────── + +DEFAULT_MAX_TURNS = 20 +DEFAULT_JUDGE_TIMEOUT = 30.0 +# Cap how much of the last response + recent messages we send to the judge. +_JUDGE_RESPONSE_SNIPPET_CHARS = 4000 + + +CONTINUATION_PROMPT_TEMPLATE = ( + "[Continuing toward your standing goal]\n" + "Goal: {goal}\n\n" + "Continue working toward this goal. Take the next concrete step. " + "If you believe the goal is complete, state so explicitly and stop. " + "If you are blocked and need input from the user, say so clearly and stop." +) + + +JUDGE_SYSTEM_PROMPT = ( + "You are a strict judge evaluating whether an autonomous agent has " + "achieved a user's stated goal. You receive the goal text and the " + "agent's most recent response. 
Your only job is to decide whether " + "the goal is fully satisfied based on that response.\n\n" + "A goal is DONE only when:\n" + "- The response explicitly confirms the goal was completed, OR\n" + "- The response clearly shows the final deliverable was produced, OR\n" + "- The response explains the goal is unachievable / blocked / needs " + "user input (treat this as DONE with reason describing the block).\n\n" + "Otherwise the goal is NOT done — CONTINUE.\n\n" + "Reply ONLY with a single JSON object on one line:\n" + '{\"done\": , \"reason\": \"\"}' +) + + +JUDGE_USER_PROMPT_TEMPLATE = ( + "Goal:\n{goal}\n\n" + "Agent's most recent response:\n{response}\n\n" + "Is the goal satisfied?" +) + + +# ────────────────────────────────────────────────────────────────────── +# Dataclass +# ────────────────────────────────────────────────────────────────────── + + +@dataclass +class GoalState: + """Serializable goal state stored per session.""" + + goal: str + status: str = "active" # active | paused | done | cleared + turns_used: int = 0 + max_turns: int = DEFAULT_MAX_TURNS + created_at: float = 0.0 + last_turn_at: float = 0.0 + last_verdict: Optional[str] = None # "done" | "continue" | "skipped" + last_reason: Optional[str] = None + paused_reason: Optional[str] = None # why we auto-paused (budget, etc.) 
+ + def to_json(self) -> str: + return json.dumps(asdict(self), ensure_ascii=False) + + @classmethod + def from_json(cls, raw: str) -> "GoalState": + data = json.loads(raw) + return cls( + goal=data.get("goal", ""), + status=data.get("status", "active"), + turns_used=int(data.get("turns_used", 0) or 0), + max_turns=int(data.get("max_turns", DEFAULT_MAX_TURNS) or DEFAULT_MAX_TURNS), + created_at=float(data.get("created_at", 0.0) or 0.0), + last_turn_at=float(data.get("last_turn_at", 0.0) or 0.0), + last_verdict=data.get("last_verdict"), + last_reason=data.get("last_reason"), + paused_reason=data.get("paused_reason"), + ) + + +# ────────────────────────────────────────────────────────────────────── +# Persistence (SessionDB state_meta) +# ────────────────────────────────────────────────────────────────────── + + +def _meta_key(session_id: str) -> str: + return f"goal:{session_id}" + + +_DB_CACHE: Dict[str, Any] = {} + + +def _get_session_db() -> Optional[Any]: + """Return a SessionDB instance for the current HERMES_HOME. + + SessionDB has no built-in singleton, but opening a new connection per + /goal call would thrash the file. We cache one instance per + ``hermes_home`` path so profile switches still pick up the right DB. + Defensive against import/instantiation failures so tests and + non-standard launchers can still use the GoalManager. 
+ """ + try: + from hermes_constants import get_hermes_home + from hermes_state import SessionDB + + home = str(get_hermes_home()) + except Exception as exc: # pragma: no cover + logger.debug("GoalManager: SessionDB bootstrap failed (%s)", exc) + return None + + cached = _DB_CACHE.get(home) + if cached is not None: + return cached + try: + db = SessionDB() + except Exception as exc: # pragma: no cover + logger.debug("GoalManager: SessionDB() raised (%s)", exc) + return None + _DB_CACHE[home] = db + return db + + +def load_goal(session_id: str) -> Optional[GoalState]: + """Load the goal for a session, or None if none exists.""" + if not session_id: + return None + db = _get_session_db() + if db is None: + return None + try: + raw = db.get_meta(_meta_key(session_id)) + except Exception as exc: + logger.debug("GoalManager: get_meta failed: %s", exc) + return None + if not raw: + return None + try: + return GoalState.from_json(raw) + except Exception as exc: + logger.warning("GoalManager: could not parse stored goal for %s: %s", session_id, exc) + return None + + +def save_goal(session_id: str, state: GoalState) -> None: + """Persist a goal to SessionDB. 
No-op if DB unavailable.""" + if not session_id: + return + db = _get_session_db() + if db is None: + return + try: + db.set_meta(_meta_key(session_id), state.to_json()) + except Exception as exc: + logger.debug("GoalManager: set_meta failed: %s", exc) + + +def clear_goal(session_id: str) -> None: + """Mark a goal cleared in the DB (preserved for audit, status=cleared).""" + state = load_goal(session_id) + if state is None: + return + state.status = "cleared" + save_goal(session_id, state) + + +# ────────────────────────────────────────────────────────────────────── +# Judge +# ────────────────────────────────────────────────────────────────────── + + +def _truncate(text: str, limit: int) -> str: + if not text: + return "" + if len(text) <= limit: + return text + return text[:limit] + "… [truncated]" + + +_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL) + + +def _parse_judge_response(raw: str) -> Tuple[bool, str]: + """Parse the judge's reply. Fail-open to ``(False, "")``. + + Returns ``(done, reason)``. + """ + if not raw: + return False, "judge returned empty response" + + text = raw.strip() + + # Strip markdown code fences the model may wrap JSON in. + if text.startswith("```"): + text = text.strip("`") + # Peel off leading json/JSON/etc tag + nl = text.find("\n") + if nl != -1: + text = text[nl + 1:] + + # First try: parse the whole blob. + data: Optional[Dict[str, Any]] = None + try: + data = json.loads(text) + except Exception: + # Second try: pull the first JSON object out. 
+ match = _JSON_OBJECT_RE.search(text) + if match: + try: + data = json.loads(match.group(0)) + except Exception: + data = None + + if not isinstance(data, dict): + return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}" + + done_val = data.get("done") + if isinstance(done_val, str): + done = done_val.strip().lower() in ("true", "yes", "1", "done") + else: + done = bool(done_val) + reason = str(data.get("reason") or "").strip() + if not reason: + reason = "no reason provided" + return done, reason + + +def judge_goal( + goal: str, + last_response: str, + *, + timeout: float = DEFAULT_JUDGE_TIMEOUT, +) -> Tuple[str, str]: + """Ask the auxiliary model whether the goal is satisfied. + + Returns ``(verdict, reason)`` where verdict is ``"done"``, ``"continue"``, + or ``"skipped"`` (when the judge couldn't be reached). + + This is deliberately fail-open: any error returns ``("continue", "...")`` + so a broken judge doesn't wedge progress — the turn budget is the + backstop. + """ + if not goal.strip(): + return "skipped", "empty goal" + if not last_response.strip(): + # No substantive reply this turn — almost certainly not done yet. 
+ return "continue", "empty response (nothing to evaluate)" + + try: + from agent.auxiliary_client import get_text_auxiliary_client + except Exception as exc: + logger.debug("goal judge: auxiliary client import failed: %s", exc) + return "continue", "auxiliary client unavailable" + + try: + client, model = get_text_auxiliary_client("goal_judge") + except Exception as exc: + logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc) + return "continue", "auxiliary client unavailable" + + if client is None or not model: + return "continue", "no auxiliary client configured" + + prompt = JUDGE_USER_PROMPT_TEMPLATE.format( + goal=_truncate(goal, 2000), + response=_truncate(last_response, _JUDGE_RESPONSE_SNIPPET_CHARS), + ) + + try: + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": JUDGE_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + temperature=0, + max_tokens=200, + timeout=timeout, + ) + except Exception as exc: + logger.info("goal judge: API call failed (%s) — falling through to continue", exc) + return "continue", f"judge error: {type(exc).__name__}" + + try: + raw = resp.choices[0].message.content or "" + except Exception: + raw = "" + + done, reason = _parse_judge_response(raw) + verdict = "done" if done else "continue" + logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120)) + return verdict, reason + + +# ────────────────────────────────────────────────────────────────────── +# GoalManager — the orchestration surface CLI + gateway talk to +# ────────────────────────────────────────────────────────────────────── + + +class GoalManager: + """Per-session goal state + continuation decisions. + + The CLI and gateway each hold one ``GoalManager`` per live session. + + Methods: + + - ``set(goal)`` — start a new standing goal. + - ``clear()`` — remove the active goal. + - ``pause()`` / ``resume()`` — explicit user controls. + - ``status()`` — printable one-liner. 
+ - ``evaluate_after_turn(last_response)`` — call the judge, update state, + and return a decision dict the caller uses to drive the next turn. + - ``next_continuation_prompt()`` — the canonical user-role message to + feed back into ``run_conversation``. + """ + + def __init__(self, session_id: str, *, default_max_turns: int = DEFAULT_MAX_TURNS): + self.session_id = session_id + self.default_max_turns = int(default_max_turns or DEFAULT_MAX_TURNS) + self._state: Optional[GoalState] = load_goal(session_id) + + # --- introspection ------------------------------------------------ + + @property + def state(self) -> Optional[GoalState]: + return self._state + + def is_active(self) -> bool: + return self._state is not None and self._state.status == "active" + + def has_goal(self) -> bool: + return self._state is not None and self._state.status in ("active", "paused") + + def status_line(self) -> str: + s = self._state + if s is None or s.status in ("cleared",): + return "No active goal. Set one with /goal ." 
    # --- mutation -----------------------------------------------------

    def set(self, goal: str, *, max_turns: Optional[int] = None) -> GoalState:
        """Create, activate, and persist a new goal, replacing any prior state.

        Raises:
            ValueError: if the goal text is empty or whitespace-only.
        """
        goal = (goal or "").strip()
        if not goal:
            raise ValueError("goal text is empty")
        state = GoalState(
            goal=goal,
            status="active",
            turns_used=0,
            # NOTE(review): a falsy max_turns (None *or* 0) falls back to the
            # manager default — confirm 0 is never meant as "unlimited".
            max_turns=int(max_turns) if max_turns else self.default_max_turns,
            created_at=time.time(),
            last_turn_at=0.0,
        )
        self._state = state
        save_goal(self.session_id, state)
        return state

    def pause(self, reason: str = "user-paused") -> Optional[GoalState]:
        """Mark the current goal paused (recording *reason*) and persist it.

        Returns the updated state, or None when no goal exists.
        """
        if not self._state:
            return None
        self._state.status = "paused"
        self._state.paused_reason = reason
        save_goal(self.session_id, self._state)
        return self._state

    def resume(self, *, reset_budget: bool = True) -> Optional[GoalState]:
        """Re-activate a goal; by default the turn budget restarts from zero.

        Returns the updated state, or None when no goal exists.
        """
        if not self._state:
            return None
        self._state.status = "active"
        self._state.paused_reason = None
        if reset_budget:
            self._state.turns_used = 0
        save_goal(self.session_id, self._state)
        return self._state

    def clear(self) -> None:
        """Persist a terminal "cleared" status, then drop the in-memory state."""
        if self._state is None:
            return
        self._state.status = "cleared"
        # Save *before* discarding so the stored record reflects the clear.
        save_goal(self.session_id, self._state)
        self._state = None

    def mark_done(self, reason: str) -> None:
        """Force the goal to "done" with *reason*, bypassing the judge."""
        if not self._state:
            return
        self._state.status = "done"
        self._state.last_verdict = "done"
        self._state.last_reason = reason
        save_goal(self.session_id, self._state)

    # --- the main entry point called after every turn -----------------

    def evaluate_after_turn(
        self,
        last_response: str,
        *,
        user_initiated: bool = True,
    ) -> Dict[str, Any]:
        """Run the judge and update state. Return a decision dict.

        ``user_initiated`` distinguishes a real user prompt (True) from a
        continuation prompt we fed ourselves (False). Both increment
        ``turns_used`` because both consume model budget.

        Decision keys:
        - ``status``: current goal status after update
        - ``should_continue``: bool — caller should fire another turn
        - ``continuation_prompt``: str or None
        - ``verdict``: "done" | "continue" | "skipped" | "inactive"
        - ``reason``: str
        - ``message``: user-visible one-liner to print/send
        """
        # NOTE(review): ``user_initiated`` is informational today — both paths
        # below count the turn identically; nothing in this body reads it.
        state = self._state
        if state is None or state.status != "active":
            # Paused/done/cleared goals are no-ops: no turn counting, no judge call.
            return {
                "status": state.status if state else None,
                "should_continue": False,
                "continuation_prompt": None,
                "verdict": "inactive",
                "reason": "no active goal",
                "message": "",
            }

        # Count the turn that just finished.
        state.turns_used += 1
        state.last_turn_at = time.time()

        verdict, reason = judge_goal(state.goal, last_response)
        state.last_verdict = verdict
        state.last_reason = reason

        if verdict == "done":
            state.status = "done"
            save_goal(self.session_id, state)
            return {
                "status": "done",
                "should_continue": False,
                "continuation_prompt": None,
                "verdict": "done",
                "reason": reason,
                "message": f"✓ Goal achieved: {reason}",
            }

        # Budget ceiling reached → auto-pause instead of looping forever.
        # The judge's verdict stays "continue" so the caller can distinguish
        # "paused by budget" from "paused by user".
        if state.turns_used >= state.max_turns:
            state.status = "paused"
            state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
            save_goal(self.session_id, state)
            return {
                "status": "paused",
                "should_continue": False,
                "continuation_prompt": None,
                "verdict": "continue",
                "reason": reason,
                "message": (
                    f"⏸ Goal paused — {state.turns_used}/{state.max_turns} turns used. "
                    "Use /goal resume to keep going, or /goal clear to stop."
                ),
            }

        # Still active and under budget: persist the incremented counter and
        # hand the caller a ready-to-send continuation prompt.
        save_goal(self.session_id, state)
        return {
            "status": "active",
            "should_continue": True,
            "continuation_prompt": self.next_continuation_prompt(),
            "verdict": "continue",
            "reason": reason,
            "message": (
                f"↻ Continuing toward goal ({state.turns_used}/{state.max_turns}): {reason}"
            ),
        }

    def next_continuation_prompt(self) -> Optional[str]:
        """Return the user-role prompt that re-feeds the goal, or None if not active."""
        if not self._state or self._state.status != "active":
            return None
        return CONTINUATION_PROMPT_TEMPLATE.format(goal=self._state.goal)


__all__ = [
    "GoalState",
    "GoalManager",
    "CONTINUATION_PROMPT_TEMPLATE",
    "DEFAULT_MAX_TURNS",
    "load_goal",
    "save_goal",
    "clear_goal",
    "judge_goal",
]
"""Tests for hermes_cli/goals.py — persistent cross-turn goals."""

from __future__ import annotations

import json
from unittest.mock import patch, MagicMock

import pytest


# ──────────────────────────────────────────────────────────────────────
# Fixtures
# ──────────────────────────────────────────────────────────────────────


@pytest.fixture
def hermes_home(tmp_path, monkeypatch):
    """Isolated HERMES_HOME so SessionDB.state_meta writes don't clobber the real one."""
    from pathlib import Path

    home = tmp_path / ".hermes"
    home.mkdir()
    # Redirect Path.home() so anything resolving "~" lands inside tmp_path.
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    monkeypatch.setenv("HERMES_HOME", str(home))

    # Bust the goal-module's DB cache for each test so it re-resolves HERMES_HOME.
    from hermes_cli import goals

    goals._DB_CACHE.clear()
    yield home
    # Clear again on teardown so the next test never sees this test's DB handle.
    goals._DB_CACHE.clear()


# ──────────────────────────────────────────────────────────────────────
# _parse_judge_response
# ──────────────────────────────────────────────────────────────────────


class TestParseJudgeResponse:
    def test_clean_json_done(self):
        from hermes_cli.goals import _parse_judge_response

        done, reason = _parse_judge_response('{"done": true, "reason": "all good"}')
        assert done is True
        assert reason == "all good"

    def test_clean_json_continue(self):
        from hermes_cli.goals import _parse_judge_response

        done, reason = _parse_judge_response('{"done": false, "reason": "more work needed"}')
        assert done is False
        assert reason == "more work needed"

    def test_json_in_markdown_fence(self):
        from hermes_cli.goals import _parse_judge_response

        raw = '```json\n{"done": true, "reason": "done"}\n```'
        done, reason = _parse_judge_response(raw)
        assert done is True
        assert "done" in reason

    def test_json_embedded_in_prose(self):
        """Some models prefix reasoning before emitting JSON — we extract it."""
        from hermes_cli.goals import _parse_judge_response

        raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
        done, reason = _parse_judge_response(raw)
        assert done is False
        assert reason == "partial"

    def test_string_done_values(self):
        from hermes_cli.goals import _parse_judge_response

        # "done" may arrive as a JSON string rather than a bool; coercion must
        # accept the common truthy spellings and reject the falsy ones.
        for s in ("true", "yes", "done", "1"):
            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
            assert done is True
        for s in ("false", "no", "not yet"):
            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
            assert done is False

    def test_malformed_json_fails_open(self):
        """Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
        from hermes_cli.goals import _parse_judge_response

        done, reason = _parse_judge_response("this is not json at all")
        assert done is False
        assert reason  # non-empty

    def test_empty_response(self):
        from hermes_cli.goals import _parse_judge_response

        done, reason = _parse_judge_response("")
        assert done is False
        assert reason


# ──────────────────────────────────────────────────────────────────────
# judge_goal — fail-open semantics
# ──────────────────────────────────────────────────────────────────────


class TestJudgeGoal:
    def test_empty_goal_skipped(self):
        from hermes_cli.goals import judge_goal

        verdict, _ = judge_goal("", "some response")
        assert verdict == "skipped"

    def test_empty_response_continues(self):
        from hermes_cli.goals import judge_goal

        verdict, _ = judge_goal("ship the thing", "")
        assert verdict == "continue"

    def test_no_aux_client_continues(self):
        """Fail-open: if no aux client, we must return continue, not skipped/done."""
        from hermes_cli import goals

        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(None, None),
        ):
            verdict, _ = goals.judge_goal("my goal", "my response")
        assert verdict == "continue"

    def test_api_error_continues(self):
        """Judge exception → fail-open continue (don't wedge progress on judge bugs)."""
        from hermes_cli import goals

        fake_client = MagicMock()
        fake_client.chat.completions.create.side_effect = RuntimeError("boom")
        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
            verdict, reason = goals.judge_goal("goal", "response")
        assert verdict == "continue"
        assert "judge error" in reason.lower()

    def test_judge_says_done(self):
        from hermes_cli import goals

        # Mock mirrors the OpenAI-style response shape: choices[0].message.content.
        fake_client = MagicMock()
        fake_client.chat.completions.create.return_value = MagicMock(
            choices=[
                MagicMock(
                    message=MagicMock(content='{"done": true, "reason": "achieved"}')
                )
            ]
        )
        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
            verdict, reason = goals.judge_goal("goal", "agent response")
        assert verdict == "done"
        assert reason == "achieved"

    def test_judge_says_continue(self):
        from hermes_cli import goals

        fake_client = MagicMock()
        fake_client.chat.completions.create.return_value = MagicMock(
            choices=[
                MagicMock(
                    message=MagicMock(content='{"done": false, "reason": "not yet"}')
                )
            ]
        )
        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
            verdict, reason = goals.judge_goal("goal", "agent response")
        assert verdict == "continue"
        assert reason == "not yet"


# ──────────────────────────────────────────────────────────────────────
# GoalManager lifecycle + persistence
# ──────────────────────────────────────────────────────────────────────


class TestGoalManager:
    def test_no_goal_initial(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="test-sid-1")
        assert mgr.state is None
        assert not mgr.is_active()
        assert not mgr.has_goal()
        assert "No active goal" in mgr.status_line()

    def test_set_then_status(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="test-sid-2", default_max_turns=5)
        state = mgr.set("port the thing")
        assert state.goal == "port the thing"
        assert state.status == "active"
        assert state.max_turns == 5
        assert state.turns_used == 0
        assert mgr.is_active()
        assert "active" in mgr.status_line().lower()
        assert "port the thing" in mgr.status_line()

    def test_set_rejects_empty(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="test-sid-3")
        with pytest.raises(ValueError):
            mgr.set("")
        with pytest.raises(ValueError):
            mgr.set("   ")

    def test_pause_and_resume(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="test-sid-4")
        mgr.set("goal text")
        mgr.pause(reason="user-paused")
        assert mgr.state.status == "paused"
        assert not mgr.is_active()
        # Paused still counts as "has a goal" — only clear() drops it.
        assert mgr.has_goal()

        mgr.resume()
        assert mgr.state.status == "active"
        assert mgr.is_active()

    def test_clear(self, hermes_home):
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="test-sid-5")
        mgr.set("goal")
        mgr.clear()
        assert mgr.state is None
        assert not mgr.is_active()

    def test_persistence_across_managers(self, hermes_home):
        """Key invariant: a second manager on the same session sees the goal.

        This is what makes /resume work — each session rebinds its
        GoalManager and picks up the saved state.
        """
        from hermes_cli.goals import GoalManager

        mgr1 = GoalManager(session_id="persist-sid")
        mgr1.set("do the thing")

        mgr2 = GoalManager(session_id="persist-sid")
        assert mgr2.state is not None
        assert mgr2.state.goal == "do the thing"
        assert mgr2.is_active()

    def test_evaluate_after_turn_done(self, hermes_home):
        """Judge says done → status=done, no continuation."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="eval-sid-1")
        mgr.set("ship it")

        with patch.object(goals, "judge_goal", return_value=("done", "shipped")):
            decision = mgr.evaluate_after_turn("I shipped the feature.")

        assert decision["verdict"] == "done"
        assert decision["should_continue"] is False
        assert decision["continuation_prompt"] is None
        assert mgr.state.status == "done"
        assert mgr.state.turns_used == 1

    def test_evaluate_after_turn_continue_under_budget(self, hermes_home):
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
        mgr.set("a long goal")

        with patch.object(goals, "judge_goal", return_value=("continue", "more work")):
            decision = mgr.evaluate_after_turn("made some progress")

        assert decision["verdict"] == "continue"
        assert decision["should_continue"] is True
        assert decision["continuation_prompt"] is not None
        # The continuation prompt must carry the goal text verbatim.
        assert "a long goal" in decision["continuation_prompt"]
        assert mgr.state.status == "active"
        assert mgr.state.turns_used == 1

    def test_evaluate_after_turn_budget_exhausted(self, hermes_home):
        """When turn budget hits ceiling, auto-pause instead of continuing."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
        mgr.set("hard goal")

        with patch.object(goals, "judge_goal", return_value=("continue", "not yet")):
            d1 = mgr.evaluate_after_turn("step 1")
            assert d1["should_continue"] is True
            assert mgr.state.turns_used == 1
            assert mgr.state.status == "active"

            d2 = mgr.evaluate_after_turn("step 2")
            # turns_used is now 2 which equals max_turns → paused
            assert d2["should_continue"] is False
            assert mgr.state.status == "paused"
            assert mgr.state.turns_used == 2
            assert "budget" in (mgr.state.paused_reason or "").lower()

    def test_evaluate_after_turn_inactive(self, hermes_home):
        """evaluate_after_turn is a no-op when goal isn't active."""
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="eval-sid-4")
        d = mgr.evaluate_after_turn("anything")
        assert d["verdict"] == "inactive"
        assert d["should_continue"] is False

        mgr.set("a goal")
        mgr.pause()
        d2 = mgr.evaluate_after_turn("anything")
        assert d2["verdict"] == "inactive"
        assert d2["should_continue"] is False

    def test_continuation_prompt_shape(self, hermes_home):
        """The continuation prompt must include the goal text verbatim —
        and must be safe to inject as a user-role message (prompt-cache
        invariants: no system-prompt mutation)."""
        from hermes_cli.goals import GoalManager

        mgr = GoalManager(session_id="cont-sid")
        mgr.set("port goal command to hermes")
        prompt = mgr.next_continuation_prompt()
        assert prompt is not None
        assert "port goal command to hermes" in prompt
        assert prompt.strip()  # non-empty


# ──────────────────────────────────────────────────────────────────────
# Smoke: CommandDef is wired
# ──────────────────────────────────────────────────────────────────────


def test_goal_command_in_registry():
    from hermes_cli.commands import resolve_command

    cmd = resolve_command("goal")
    assert cmd is not None
    assert cmd.name == "goal"


def test_goal_command_dispatches_in_cli_registry_helpers():
    """goal shows up in autocomplete / help categories alongside other Session cmds."""
    from hermes_cli.commands import COMMANDS, COMMANDS_BY_CATEGORY

    assert "/goal" in COMMANDS
    session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
    assert "/goal" in session_cmds
COMMANDS_BY_CATEGORY.get("Session", {}) + assert "/goal" in session_cmds diff --git a/website/docs/reference/slash-commands.md b/website/docs/reference/slash-commands.md index 6cc37287cb..e70a923a92 100644 --- a/website/docs/reference/slash-commands.md +++ b/website/docs/reference/slash-commands.md @@ -34,6 +34,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in | `/stop` | Kill all running background processes | | `/queue ` (alias: `/q`) | Queue a prompt for the next turn (doesn't interrupt the current agent response). | | `/steer ` | Inject a mid-run note that arrives at the agent **after the next tool call** — no interrupt, no new user turn. The text is appended to the last tool result's content once the current tool completes, giving the agent new context without breaking the current tool-calling loop. Use this to nudge direction mid-task (e.g. "focus on the auth module" while the agent is running tests). | +| `/goal ` | Set a standing goal Hermes works toward across turns. After each turn an auxiliary model judges whether the goal is satisfied by the agent's last response; if not, Hermes automatically feeds a continuation prompt back into the same session and keeps working. Subcommands: `/goal` (status), `/goal status`, `/goal pause`, `/goal resume`, `/goal clear`. Budget defaults to 20 turns (`goals.max_turns` in `config.yaml`); any real user message preempts the continuation loop. Our take on the Ralph loop — state survives `/resume` because it's stored in `state_meta` keyed by session ID. | | `/resume [name]` | Resume a previously-named session | | `/redraw` | Force a full UI repaint (recovers from terminal drift after tmux resize, mouse selection artifacts, etc.) | | `/status` | Show session info | @@ -153,6 +154,7 @@ The messaging gateway supports the following built-in commands inside Telegram, | `/background ` | Run a prompt in a separate background session. 
Results are delivered back to the same chat when the task finishes. See [Messaging Background Sessions](/docs/user-guide/messaging/#background-sessions). | | `/queue ` (alias: `/q`) | Queue a prompt for the next turn without interrupting the current one. | | `/steer ` | Inject a message after the next tool call without interrupting — the model picks it up on its next iteration rather than as a new turn. | +| `/goal ` | Set a standing goal Hermes works toward across turns. A judge model checks after each turn whether the goal is satisfied; if not, Hermes auto-continues until it is, you pause/clear it, or the turn budget (default 20) is hit. Subcommands: `/goal status`, `/goal pause`, `/goal resume`, `/goal clear`. Safe to run mid-agent for status/pause/clear; setting a new goal requires `/stop` first. | | `/footer [on\|off\|status]` | Toggle the runtime-metadata footer on final replies (shows model, tool counts, timing). | | `/curator [status\|run\|pin\|archive]` | Background skill maintenance controls. | | `/reload-mcp` (alias: `/reload_mcp`) | Reload MCP servers from config. |