mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
* feat(goals): /goal checklist + /subgoal user controls
Two-phase judge for /goal — Phase A decomposes the goal into a detailed
checklist on first turn; Phase B evaluates each pending item harshly
against the agent's most recent response. The goal completes only when
every item is in a terminal status (completed or impossible). Adds
/subgoal so the user can append, complete, mark impossible, undo,
remove, or clear items the judge missed or got wrong.
Mechanics:
- GoalState gains `checklist` and `decomposed` fields, both backwards
compatible (old state_meta rows load unchanged).
- Phase A: aux call writes a harsh, exhaustive checklist; biased toward
more items not fewer. Falls through to legacy freeform judge when
decompose fails.
- Phase B: judge gets the checklist + last-response snippet + path to
a per-session conversation dump at <HERMES_HOME>/goals/<sid>.json.
A bounded read_file tool (max 5 calls per turn, restricted to that
one file) lets the judge inspect history when the snippet is
ambiguous. Stickiness in code: terminal items are frozen, only the
user can revert via /subgoal undo.
- Continuation prompt shows checklist progress when non-empty;
reverts to old prompt when empty.
- Status line shows M/N done counts.
CLI + gateway + TUI gateway all pass the agent reference into
evaluate_after_turn so the dump can be written. Gateway-side
/subgoal is allowed mid-run since it only modifies the checklist
the judge consults at turn boundaries.
Tests: 24 new cases — backcompat round-trip, Phase A decompose,
Phase B updates + new_items + stickiness, user override flows,
conversation dump (incl. unsafe-sid sanitization), judge read_file
restriction. Existing freeform-mode tests updated to patch the
renamed `judge_goal_freeform` and skip Phase A explicitly.
* fix(goals): off-by-one in judge index, message-list plumbing, prompt tuning
Three live-test findings from running /goal end-to-end against
gemini-3-flash-preview as the judge:
1. Off-by-one bug — the judge sees the checklist rendered with 1-based
indices ('1. [ ] foo, 2. [ ] bar') but the apply layer indexed
state.checklist as 0-based. Result: every judge update landed on
the wrong item, evidence got attached to neighbouring rows, and
the genuine 'first pending' item (usually #1) never got marked.
Fix: convert the judge's 1-based indices to 0-based in _parse_evaluate_response. Also tightened the
user prompt to call out the 1-based scheme explicitly. New tests
cover the parser conversion + an end-to-end fake-judge round-trip.
2. Conversation dump never happened — _extract_agent_messages tried
common AIAgent attribute names (.messages, .conversation_history,
etc.) but AIAgent doesn't expose the message list as an instance
attribute; it lives inside run_conversation()'s scope. Result: the
judge's read_file tool always saw history_path=unavailable. Fix:
added an explicit messages= kwarg to evaluate_after_turn that all
three call sites (CLI, gateway, TUI gateway) now pass directly.
Agent-attribute extraction kept as back-compat fallback.
3. Prompt was too harsh on simple goals. The original 'be HARSH,
default to leaving items pending' wording made the judge refuse
to mark 'file exists' completed even after the agent ran ls,
test -f, os.path.isfile, and find — burning the entire 8-turn
budget on a fizzbuzz task. Softened to 'strict but not absurd'
with explicit guidance on what counts as evidence and a directive
not to require re-proving items already established earlier.
Re-tested live with the same fizzbuzz goal: now terminates in 2
turns with all 8 checklist items correctly attributed to their
own evidence. /subgoal user-action flow (add / complete / undo /
impossible) verified live as well.
227 lines
7.7 KiB
Python
227 lines
7.7 KiB
Python
"""Tests for gateway /goal verdict-message delivery.
|
|
|
|
The judge verdict message ("✓ Goal achieved", "⏸ budget exhausted", etc.)
|
|
must reach the user after each turn. Before this fix the code checked
|
|
``hasattr(adapter, "send_message")`` — but adapters expose ``send()``,
|
|
never ``send_message``, so the check always evaluated False and users
|
|
never saw verdicts. This test locks in the fix.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.session import SessionEntry, SessionSource, build_session_key
|
|
|
|
|
|
@pytest.fixture()
def hermes_home(tmp_path, monkeypatch):
    """Provide an isolated ``.hermes`` directory for a single test.

    Creates ``<tmp_path>/.hermes``, points ``HERMES_HOME`` at it and makes
    ``Path.home()`` return ``tmp_path``. ``hermes_cli.goals._DB_CACHE`` is
    emptied before the directory is handed to the test and again after the
    test has finished.

    Yields:
        The freshly created ``.hermes`` directory.
    """
    hermes_dir = tmp_path / ".hermes"
    hermes_dir.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(hermes_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    # Imported here, after the environment has been patched.
    from hermes_cli import goals as goals_module

    goals_module._DB_CACHE.clear()
    yield hermes_dir
    goals_module._DB_CACHE.clear()
|
|
|
|
|
|
def _make_source() -> SessionSource:
    """Return the fixed Telegram DM ``SessionSource`` shared by these tests."""
    source_fields = {
        "platform": Platform.TELEGRAM,
        "user_id": "u1",
        "chat_id": "c1",
        "user_name": "tester",
        "chat_type": "dm",
    }
    return SessionSource(**source_fields)
|
|
|
|
|
|
class _RecordingAdapter:
|
|
"""Minimal adapter that records send() invocations."""
|
|
|
|
def __init__(self) -> None:
|
|
self._pending_messages: dict = {}
|
|
self.sends: list[dict] = []
|
|
|
|
async def send(self, chat_id: str, content: str, reply_to=None, metadata=None):
|
|
self.sends.append({"chat_id": chat_id, "content": content, "metadata": metadata})
|
|
|
|
class _R:
|
|
success = True
|
|
message_id = "mock-msg"
|
|
|
|
return _R()
|
|
|
|
|
|
def _make_runner_with_adapter(session_id: str | None = None):
    """Build a bare ``GatewayRunner`` wired to a recording Telegram adapter.

    The runner is allocated with ``object.__new__`` so
    ``GatewayRunner.__init__`` never runs; only the attributes assigned
    below exist on it. Its ``session_store`` is a ``MagicMock`` that always
    hands back the session entry created here.

    Args:
        session_id: Session id for the fake session entry. When omitted (or
            falsy) a unique ``goal-sess-<8 hex chars>`` id is generated.

    Returns:
        A ``(runner, adapter, session_entry, source)`` tuple.
    """
    from gateway.run import GatewayRunner
    import uuid

    runner = object.__new__(GatewayRunner)
    runner.config = GatewayConfig(
        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")},
    )
    runner.adapters = {}
    runner._running_agents = {}
    runner._running_agents_ts = {}
    runner._queued_events = {}

    src = _make_source()
    session_key = build_session_key(src)
    now = datetime.now()
    # Default to a unique session_id so xdist parallel runs on the same worker
    # don't see each other's GoalManager state (DEFAULT_DB_PATH gets frozen at
    # module-import time, defeating per-test HERMES_HOME monkeypatches).
    session_entry = SessionEntry(
        session_key=session_key,
        session_id=session_id or f"goal-sess-{uuid.uuid4().hex[:8]}",
        created_at=now,
        updated_at=now,
        platform=Platform.TELEGRAM,
        chat_type="dm",
    )

    runner.session_store = MagicMock()
    runner.session_store.get_or_create_session.return_value = session_entry
    runner.session_store._generate_session_key.return_value = session_key

    adapter = _RecordingAdapter()
    runner.adapters[Platform.TELEGRAM] = adapter
    return runner, adapter, session_entry, src
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
    """A "done" verdict must be delivered to the user via ``adapter.send()``.

    Exactly one message is expected, addressed to chat ``c1`` and containing
    both "Goal achieved" and the judge's reason.
    """
    runner, adapter, session_entry, src = _make_runner_with_adapter()

    from hermes_cli.goals import GoalManager, save_goal

    # Persist a goal flagged as already decomposed.
    manager = GoalManager(session_entry.session_id)
    manager.set("ship the feature")
    manager.state.decomposed = True
    save_goal(manager.session_id, manager.state)

    judge_verdict = ("done", "the feature shipped", False)
    with patch("hermes_cli.goals.judge_goal_freeform", return_value=judge_verdict):
        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="I shipped the feature.",
        )
        # fire-and-forget create_task — give the loop a tick
        await asyncio.sleep(0.05)

    assert len(adapter.sends) == 1, f"expected 1 send, got {len(adapter.sends)}: {adapter.sends}"
    delivered = adapter.sends[0]
    assert delivered["chat_id"] == "c1"
    body = delivered["content"]
    assert "Goal achieved" in body
    assert "the feature shipped" in body
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
    """A "continue" verdict sends a status line and queues the next prompt.

    The status message goes out through ``adapter.send()``; the continuation
    prompt must show up in the adapter's ``_pending_messages`` so the goal
    loop proceeds on the next turn.
    """
    runner, adapter, session_entry, src = _make_runner_with_adapter()

    from hermes_cli.goals import GoalManager, save_goal

    # Persist a goal flagged as already decomposed.
    manager = GoalManager(session_entry.session_id)
    manager.set("polish the docs")
    manager.state.decomposed = True
    save_goal(manager.session_id, manager.state)

    judge_verdict = ("continue", "still needs work", False)
    with patch("hermes_cli.goals.judge_goal_freeform", return_value=judge_verdict):
        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="here's a partial edit",
        )
        await asyncio.sleep(0.05)

    # Status line sent back
    assert len(adapter.sends) == 1
    status_text = adapter.sends[0]["content"]
    assert "Continuing toward goal" in status_text
    # Continuation prompt enqueued for next turn
    assert adapter._pending_messages, "continuation prompt must be enqueued in pending_messages"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
    """With the turn budget used up, a pause notice is sent and nothing queued.

    The goal is stored with ``max_turns=2`` and ``turns_used=2`` before the
    hook runs; the single outgoing message must mention "paused" and
    "turns used", and ``_pending_messages`` must stay empty.
    """
    runner, adapter, session_entry, src = _make_runner_with_adapter()

    from hermes_cli.goals import GoalManager, save_goal

    sid = session_entry.session_id
    manager = GoalManager(sid, default_max_turns=2)
    goal_state = manager.set("tiny goal", max_turns=2)
    goal_state.turns_used = 2
    save_goal(sid, goal_state)

    judge_verdict = ("continue", "keep going", False)
    with patch("hermes_cli.goals.judge_goal_freeform", return_value=judge_verdict):
        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="still partial",
        )
        await asyncio.sleep(0.05)

    assert len(adapter.sends) == 1
    content = adapter.sends[0]["content"]
    lowered = content.lower()
    assert "paused" in lowered
    assert "turns used" in lowered
    # No continuation enqueued when budget is exhausted
    assert not adapter._pending_messages
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_goal_verdict_skipped_when_no_active_goal(hermes_home):
    """Without a goal the post-turn hook sends nothing and queues nothing."""
    runner, adapter, session_entry, src = _make_runner_with_adapter()

    # No GoalManager.set() call here: the session has no goal.
    hook_kwargs = {
        "session_entry": session_entry,
        "source": src,
        "final_response": "anything",
    }
    await runner._post_turn_goal_continuation(**hook_kwargs)
    await asyncio.sleep(0.05)

    assert adapter.sends == []
    assert adapter._pending_messages == {}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_goal_verdict_survives_adapter_without_send(hermes_home):
    """The judge hook must not raise when the adapter has no ``send``."""
    runner, _unused_adapter, session_entry, src = _make_runner_with_adapter()

    from hermes_cli.goals import GoalManager

    goal_manager = GoalManager(session_entry.session_id)
    goal_manager.set("survive missing send")

    class _NoSendAdapter:
        """Adapter exposing only ``_pending_messages`` — no ``send`` method."""

        def __init__(self):
            self._pending_messages: dict = {}

    # Swap the recording adapter for the send-less one.
    runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()

    judge_verdict = ("done", "ok", False)
    with patch("hermes_cli.goals.judge_goal_freeform", return_value=judge_verdict):
        # must not raise
        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="whatever",
        )
        await asyncio.sleep(0.05)
|