mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
* feat(goals): /subgoal — user-added criteria appended to active /goal Layers a /subgoal command on top of the existing freeform Ralph judge loop. The user can append extra criteria mid-loop; the judge factors them into its done/continue verdict and the continuation prompt surfaces them to the agent. No new tool, no agent self-judging — the existing judge model just sees a richer prompt. Forms: /subgoal show current subgoals /subgoal <text> append a criterion /subgoal remove <n> drop subgoal n (1-based) /subgoal clear wipe all subgoals How it integrates: - GoalState gains `subgoals: List[str]` (default []), backwards-compat for existing state_meta rows. - judge_goal accepts an optional subgoals kwarg; non-empty switches to JUDGE_USER_PROMPT_WITH_SUBGOALS_TEMPLATE which lists them as numbered criteria and asks 'is the goal AND every additional criterion satisfied?' - next_continuation_prompt picks CONTINUATION_PROMPT_WITH_SUBGOALS_TEMPLATE when non-empty so the agent sees what to target. - /subgoal is allowed mid-run on the gateway since it only touches the state the judge reads at turn boundary — no race with the running turn. - Status line shows '... , N subgoals' when present. Surface: - hermes_cli/goals.py — field, prompt blocks, manager methods, judge weave - hermes_cli/commands.py — /subgoal CommandDef - cli.py — _handle_subgoal_command - gateway/run.py — _handle_subgoal_command + mid-run dispatch - tests/hermes_cli/test_goals.py — 15 new tests (backcompat, mutation, persistence, prompt template selection, judge-prompt content via mock, status-line rendering) 77 goal-related tests passing across goals + cli + gateway + tui. * fix(goals): slash commands don't preempt the goal-continuation hook Two findings from live-testing /subgoal: 1. Slash commands queued while the agent is running landed in _pending_input (same queue as real user messages). The goal hook's 'is a real user message pending?' check returned True and silently skipped — but the slash command consumes its queue slot via process_command() which never re-fires the goal hook, so the loop stalls indefinitely. Now the hook peeks the queue and only defers when a non-slash payload is present. 2. The with-subgoals judge prompt was too soft — opus 4.7 said 'done, implying all requirements met' without verifying. Tightened to demand specific per-criterion evidence (file contents, output line, command result) and explicitly reject phrases like 'implying it was done.' Live verified: /subgoal injected mid-loop now correctly forces the judge to refuse done until the new criterion is met. Agent gets the continuation prompt with subgoals listed, updates the script, judge confirms done with specific evidence cited.
740 lines
29 KiB
Python
740 lines
29 KiB
Python
"""Tests for hermes_cli/goals.py — persistent cross-turn goals."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Fixtures
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@pytest.fixture
|
|
def hermes_home(tmp_path, monkeypatch):
|
|
"""Isolated HERMES_HOME so SessionDB.state_meta writes don't clobber the real one."""
|
|
from pathlib import Path
|
|
|
|
home = tmp_path / ".hermes"
|
|
home.mkdir()
|
|
monkeypatch.setattr(Path, "home", lambda: tmp_path)
|
|
monkeypatch.setenv("HERMES_HOME", str(home))
|
|
|
|
# Bust the goal-module's DB cache for each test so it re-resolves HERMES_HOME.
|
|
from hermes_cli import goals
|
|
|
|
goals._DB_CACHE.clear()
|
|
yield home
|
|
goals._DB_CACHE.clear()
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# _parse_judge_response
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestParseJudgeResponse:
|
|
def test_clean_json_done(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
|
|
assert done is True
|
|
assert reason == "all good"
|
|
|
|
def test_clean_json_continue(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
|
|
assert done is False
|
|
assert reason == "more work needed"
|
|
|
|
def test_json_in_markdown_fence(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
raw = '```json\n{"done": true, "reason": "done"}\n```'
|
|
done, reason, _ = _parse_judge_response(raw)
|
|
assert done is True
|
|
assert "done" in reason
|
|
|
|
def test_json_embedded_in_prose(self):
|
|
"""Some models prefix reasoning before emitting JSON — we extract it."""
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
|
|
done, reason, _ = _parse_judge_response(raw)
|
|
assert done is False
|
|
assert reason == "partial"
|
|
|
|
def test_string_done_values(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
for s in ("true", "yes", "done", "1"):
|
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
|
assert done is True
|
|
for s in ("false", "no", "not yet"):
|
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
|
assert done is False
|
|
|
|
def test_malformed_json_fails_open(self):
|
|
"""Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response("this is not json at all")
|
|
assert done is False
|
|
assert reason # non-empty
|
|
|
|
def test_empty_response(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response("")
|
|
assert done is False
|
|
assert reason
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# judge_goal — fail-open semantics
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeGoal:
|
|
def test_empty_goal_skipped(self):
|
|
from hermes_cli.goals import judge_goal
|
|
|
|
verdict, _, _ = judge_goal("", "some response")
|
|
assert verdict == "skipped"
|
|
|
|
def test_empty_response_continues(self):
|
|
from hermes_cli.goals import judge_goal
|
|
|
|
verdict, _, _ = judge_goal("ship the thing", "")
|
|
assert verdict == "continue"
|
|
|
|
def test_no_aux_client_continues(self):
|
|
"""Fail-open: if no aux client, we must return continue, not skipped/done."""
|
|
from hermes_cli import goals
|
|
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(None, None),
|
|
):
|
|
verdict, _, _ = goals.judge_goal("my goal", "my response")
|
|
assert verdict == "continue"
|
|
|
|
def test_api_error_continues(self):
|
|
"""Judge exception → fail-open continue (don't wedge progress on judge bugs)."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.side_effect = RuntimeError("boom")
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert "judge error" in reason.lower()
|
|
|
|
def test_judge_says_done(self):
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[
|
|
MagicMock(
|
|
message=MagicMock(content='{"done": true, "reason": "achieved"}')
|
|
)
|
|
]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
|
assert verdict == "done"
|
|
assert reason == "achieved"
|
|
|
|
def test_judge_says_continue(self):
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[
|
|
MagicMock(
|
|
message=MagicMock(content='{"done": false, "reason": "not yet"}')
|
|
)
|
|
]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
|
assert verdict == "continue"
|
|
assert reason == "not yet"
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# GoalManager lifecycle + persistence
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestGoalManager:
|
|
def test_no_goal_initial(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-1")
|
|
assert mgr.state is None
|
|
assert not mgr.is_active()
|
|
assert not mgr.has_goal()
|
|
assert "No active goal" in mgr.status_line()
|
|
|
|
def test_set_then_status(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-2", default_max_turns=5)
|
|
state = mgr.set("port the thing")
|
|
assert state.goal == "port the thing"
|
|
assert state.status == "active"
|
|
assert state.max_turns == 5
|
|
assert state.turns_used == 0
|
|
assert mgr.is_active()
|
|
assert "active" in mgr.status_line().lower()
|
|
assert "port the thing" in mgr.status_line()
|
|
|
|
def test_set_rejects_empty(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-3")
|
|
with pytest.raises(ValueError):
|
|
mgr.set("")
|
|
with pytest.raises(ValueError):
|
|
mgr.set(" ")
|
|
|
|
def test_pause_and_resume(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-4")
|
|
mgr.set("goal text")
|
|
mgr.pause(reason="user-paused")
|
|
assert mgr.state.status == "paused"
|
|
assert not mgr.is_active()
|
|
assert mgr.has_goal()
|
|
|
|
mgr.resume()
|
|
assert mgr.state.status == "active"
|
|
assert mgr.is_active()
|
|
|
|
def test_clear(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-5")
|
|
mgr.set("goal")
|
|
mgr.clear()
|
|
assert mgr.state is None
|
|
assert not mgr.is_active()
|
|
|
|
def test_persistence_across_managers(self, hermes_home):
|
|
"""Key invariant: a second manager on the same session sees the goal.
|
|
|
|
This is what makes /resume work — each session rebinds its
|
|
GoalManager and picks up the saved state.
|
|
"""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr1 = GoalManager(session_id="persist-sid")
|
|
mgr1.set("do the thing")
|
|
|
|
mgr2 = GoalManager(session_id="persist-sid")
|
|
assert mgr2.state is not None
|
|
assert mgr2.state.goal == "do the thing"
|
|
assert mgr2.is_active()
|
|
|
|
def test_evaluate_after_turn_done(self, hermes_home):
|
|
"""Judge says done → status=done, no continuation."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-1")
|
|
mgr.set("ship it")
|
|
|
|
with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
|
|
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
|
|
|
assert decision["verdict"] == "done"
|
|
assert decision["should_continue"] is False
|
|
assert decision["continuation_prompt"] is None
|
|
assert mgr.state.status == "done"
|
|
assert mgr.state.turns_used == 1
|
|
|
|
def test_evaluate_after_turn_continue_under_budget(self, hermes_home):
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
|
mgr.set("a long goal")
|
|
|
|
with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
|
|
decision = mgr.evaluate_after_turn("made some progress")
|
|
|
|
assert decision["verdict"] == "continue"
|
|
assert decision["should_continue"] is True
|
|
assert decision["continuation_prompt"] is not None
|
|
assert "a long goal" in decision["continuation_prompt"]
|
|
assert mgr.state.status == "active"
|
|
assert mgr.state.turns_used == 1
|
|
|
|
def test_evaluate_after_turn_budget_exhausted(self, hermes_home):
|
|
"""When turn budget hits ceiling, auto-pause instead of continuing."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
|
mgr.set("hard goal")
|
|
|
|
with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
|
|
d1 = mgr.evaluate_after_turn("step 1")
|
|
assert d1["should_continue"] is True
|
|
assert mgr.state.turns_used == 1
|
|
assert mgr.state.status == "active"
|
|
|
|
d2 = mgr.evaluate_after_turn("step 2")
|
|
# turns_used is now 2 which equals max_turns → paused
|
|
assert d2["should_continue"] is False
|
|
assert mgr.state.status == "paused"
|
|
assert mgr.state.turns_used == 2
|
|
assert "budget" in (mgr.state.paused_reason or "").lower()
|
|
|
|
def test_evaluate_after_turn_inactive(self, hermes_home):
|
|
"""evaluate_after_turn is a no-op when goal isn't active."""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-4")
|
|
d = mgr.evaluate_after_turn("anything")
|
|
assert d["verdict"] == "inactive"
|
|
assert d["should_continue"] is False
|
|
|
|
mgr.set("a goal")
|
|
mgr.pause()
|
|
d2 = mgr.evaluate_after_turn("anything")
|
|
assert d2["verdict"] == "inactive"
|
|
assert d2["should_continue"] is False
|
|
|
|
def test_continuation_prompt_shape(self, hermes_home):
|
|
"""The continuation prompt must include the goal text verbatim —
|
|
and must be safe to inject as a user-role message (prompt-cache
|
|
invariants: no system-prompt mutation)."""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="cont-sid")
|
|
mgr.set("port goal command to hermes")
|
|
prompt = mgr.next_continuation_prompt()
|
|
assert prompt is not None
|
|
assert "port goal command to hermes" in prompt
|
|
assert prompt.strip() # non-empty
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Smoke: CommandDef is wired
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_goal_command_in_registry():
|
|
from hermes_cli.commands import resolve_command
|
|
|
|
cmd = resolve_command("goal")
|
|
assert cmd is not None
|
|
assert cmd.name == "goal"
|
|
|
|
|
|
def test_goal_command_dispatches_in_cli_registry_helpers():
|
|
"""goal shows up in autocomplete / help categories alongside other Session cmds."""
|
|
from hermes_cli.commands import COMMANDS, COMMANDS_BY_CATEGORY
|
|
|
|
assert "/goal" in COMMANDS
|
|
session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
|
|
assert "/goal" in session_cmds
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Auto-pause on consecutive judge parse failures
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeParseFailureAutoPause:
|
|
"""Regression: weak judge models (e.g. deepseek-v4-flash) that return
|
|
empty strings or non-JSON prose must auto-pause the loop after N turns
|
|
instead of burning the whole turn budget."""
|
|
|
|
def test_parse_response_flags_empty_as_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, parse_failed = _parse_judge_response("")
|
|
assert done is False
|
|
assert parse_failed is True
|
|
assert "empty" in reason.lower()
|
|
|
|
def test_parse_response_flags_non_json_as_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, parse_failed = _parse_judge_response(
|
|
"Let me analyze whether the goal is fully satisfied based on the agent's response..."
|
|
)
|
|
assert done is False
|
|
assert parse_failed is True
|
|
assert "not json" in reason.lower()
|
|
|
|
def test_parse_response_clean_json_is_not_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, _, parse_failed = _parse_judge_response(
|
|
'{"done": false, "reason": "more work"}'
|
|
)
|
|
assert done is False
|
|
assert parse_failed is False
|
|
|
|
def test_api_error_does_not_count_as_parse_failure(self):
|
|
"""Transient network/API errors must not trip the auto-pause guard."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert parse_failed is False
|
|
|
|
def test_empty_judge_reply_flagged_as_parse_failure(self):
|
|
"""End-to-end: judge returns empty content → parse_failed=True."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[MagicMock(message=MagicMock(content=""))]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert parse_failed is True
|
|
|
|
def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
|
|
"""N=3 consecutive parse failures → auto-pause with config pointer."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
|
|
|
|
assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
|
|
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
|
|
mgr.set("do a thing")
|
|
|
|
with patch.object(
|
|
goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
|
|
):
|
|
d1 = mgr.evaluate_after_turn("step 1")
|
|
assert d1["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 1
|
|
|
|
d2 = mgr.evaluate_after_turn("step 2")
|
|
assert d2["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 2
|
|
|
|
d3 = mgr.evaluate_after_turn("step 3")
|
|
assert d3["should_continue"] is False
|
|
assert d3["status"] == "paused"
|
|
assert mgr.state.consecutive_parse_failures == 3
|
|
# Message points at the config surface so the user can fix it.
|
|
assert "auxiliary" in d3["message"]
|
|
assert "goal_judge" in d3["message"]
|
|
assert "config.yaml" in d3["message"]
|
|
|
|
def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
|
|
"""A single good judge reply resets the counter — transient flakes don't pause."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
|
|
mgr.set("another goal")
|
|
|
|
# Two parse failures…
|
|
with patch.object(
|
|
goals, "judge_goal", return_value=("continue", "not json", True)
|
|
):
|
|
mgr.evaluate_after_turn("step 1")
|
|
mgr.evaluate_after_turn("step 2")
|
|
assert mgr.state.consecutive_parse_failures == 2
|
|
|
|
# …then one clean reply resets the counter.
|
|
with patch.object(
|
|
goals, "judge_goal", return_value=("continue", "making progress", False)
|
|
):
|
|
d = mgr.evaluate_after_turn("step 3")
|
|
assert d["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 0
|
|
|
|
def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
|
|
"""API/transport errors must NOT count toward the auto-pause threshold."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
|
|
mgr.set("goal")
|
|
|
|
with patch.object(
|
|
goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
|
|
):
|
|
for _ in range(5):
|
|
d = mgr.evaluate_after_turn("still going")
|
|
assert d["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 0
|
|
assert mgr.state.status == "active"
|
|
|
|
def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
|
|
self, hermes_home
|
|
):
|
|
"""The counter must be durable so cross-session resumes see it."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager, load_goal
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
|
|
mgr.set("persistent goal")
|
|
|
|
with patch.object(
|
|
goals, "judge_goal", return_value=("continue", "empty", True)
|
|
):
|
|
mgr.evaluate_after_turn("r")
|
|
mgr.evaluate_after_turn("r")
|
|
|
|
reloaded = load_goal("parse-fail-sid-4")
|
|
assert reloaded is not None
|
|
assert reloaded.consecutive_parse_failures == 2
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# /subgoal — user-added criteria
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestGoalStateSubgoalsBackcompat:
|
|
def test_old_state_meta_row_loads_without_subgoals(self):
|
|
"""A goal serialized BEFORE the subgoals field existed must
|
|
round-trip with an empty list, not crash."""
|
|
import json
|
|
from hermes_cli.goals import GoalState
|
|
|
|
legacy = json.dumps({
|
|
"goal": "do a thing",
|
|
"status": "active",
|
|
"turns_used": 2,
|
|
"max_turns": 20,
|
|
"created_at": 1.0,
|
|
"last_turn_at": 2.0,
|
|
"consecutive_parse_failures": 0,
|
|
})
|
|
state = GoalState.from_json(legacy)
|
|
assert state.goal == "do a thing"
|
|
assert state.subgoals == []
|
|
|
|
def test_subgoals_round_trip(self):
|
|
from hermes_cli.goals import GoalState
|
|
state = GoalState(goal="g", subgoals=["a", "b", "c"])
|
|
rt = GoalState.from_json(state.to_json())
|
|
assert rt.subgoals == ["a", "b", "c"]
|
|
|
|
|
|
class TestGoalManagerSubgoals:
|
|
def test_add_subgoal(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-add")
|
|
mgr.set("main goal")
|
|
text = mgr.add_subgoal(" use bullet points ")
|
|
assert text == "use bullet points"
|
|
assert mgr.state.subgoals == ["use bullet points"]
|
|
|
|
def test_add_subgoal_requires_active_goal(self, hermes_home):
|
|
import pytest
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-noactive")
|
|
with pytest.raises(RuntimeError):
|
|
mgr.add_subgoal("oops")
|
|
|
|
def test_add_empty_subgoal_rejected(self, hermes_home):
|
|
import pytest
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-empty")
|
|
mgr.set("g")
|
|
with pytest.raises(ValueError):
|
|
mgr.add_subgoal(" ")
|
|
|
|
def test_remove_subgoal(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-remove")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("first")
|
|
mgr.add_subgoal("second")
|
|
mgr.add_subgoal("third")
|
|
removed = mgr.remove_subgoal(2)
|
|
assert removed == "second"
|
|
assert mgr.state.subgoals == ["first", "third"]
|
|
|
|
def test_remove_subgoal_out_of_range(self, hermes_home):
|
|
import pytest
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-oob")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("only")
|
|
with pytest.raises(IndexError):
|
|
mgr.remove_subgoal(5)
|
|
with pytest.raises(IndexError):
|
|
mgr.remove_subgoal(0)
|
|
|
|
def test_clear_subgoals(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-clear")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("a")
|
|
mgr.add_subgoal("b")
|
|
prev = mgr.clear_subgoals()
|
|
assert prev == 2
|
|
assert mgr.state.subgoals == []
|
|
|
|
def test_subgoals_persist_across_reloads(self, hermes_home):
|
|
"""Subgoals stored in SessionDB survive a fresh GoalManager."""
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sub-persist")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("first")
|
|
mgr.add_subgoal("second")
|
|
|
|
mgr2 = GoalManager(session_id="sub-persist")
|
|
assert mgr2.state.subgoals == ["first", "second"]
|
|
|
|
|
|
class TestContinuationPromptWithSubgoals:
|
|
def test_empty_subgoals_uses_original_template(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="cp-empty")
|
|
mgr.set("ship the feature")
|
|
prompt = mgr.next_continuation_prompt()
|
|
assert prompt is not None
|
|
assert "ship the feature" in prompt
|
|
assert "Additional criteria" not in prompt
|
|
|
|
def test_with_subgoals_includes_them(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="cp-with")
|
|
mgr.set("ship the feature")
|
|
mgr.add_subgoal("write tests")
|
|
mgr.add_subgoal("update docs")
|
|
prompt = mgr.next_continuation_prompt()
|
|
assert prompt is not None
|
|
assert "ship the feature" in prompt
|
|
assert "Additional criteria" in prompt
|
|
assert "1. write tests" in prompt
|
|
assert "2. update docs" in prompt
|
|
|
|
|
|
class TestJudgeGoalWithSubgoals:
|
|
def test_judge_uses_subgoals_template_when_provided(self, hermes_home):
|
|
"""judge_goal switches templates when subgoals is non-empty.
|
|
|
|
We don't actually call the model — we patch the aux client to
|
|
capture the prompt that would be sent.
|
|
"""
|
|
from unittest.mock import patch, MagicMock
|
|
from hermes_cli import goals
|
|
|
|
captured = {}
|
|
|
|
class _FakeMsg:
|
|
content = '{"done": true, "reason": "all done"}'
|
|
class _FakeChoice:
|
|
message = _FakeMsg()
|
|
class _FakeResp:
|
|
choices = [_FakeChoice()]
|
|
class _FakeClient:
|
|
class chat:
|
|
class completions:
|
|
@staticmethod
|
|
def create(**kwargs):
|
|
captured.update(kwargs)
|
|
return _FakeResp()
|
|
|
|
with patch.object(goals, "get_text_auxiliary_client",
|
|
return_value=(_FakeClient, "fake-model"), create=True), \
|
|
patch.object(goals, "get_auxiliary_extra_body",
|
|
return_value=None, create=True), \
|
|
patch("agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(_FakeClient, "fake-model")), \
|
|
patch("agent.auxiliary_client.get_auxiliary_extra_body",
|
|
return_value=None):
|
|
verdict, reason, parse_failed = goals.judge_goal(
|
|
"ship the feature",
|
|
"ok shipped",
|
|
subgoals=["write tests", "update docs"],
|
|
)
|
|
|
|
# The aux client was called with a prompt that includes the subgoals.
|
|
sent_messages = captured.get("messages") or []
|
|
user_msg = next((m["content"] for m in sent_messages if m["role"] == "user"), "")
|
|
assert "Additional criteria" in user_msg
|
|
assert "1. write tests" in user_msg
|
|
assert "2. update docs" in user_msg
|
|
assert "every additional criterion" in user_msg
|
|
assert verdict == "done"
|
|
|
|
def test_judge_uses_original_template_when_no_subgoals(self, hermes_home):
|
|
from unittest.mock import patch
|
|
from hermes_cli import goals
|
|
|
|
captured = {}
|
|
|
|
class _FakeMsg:
|
|
content = '{"done": true, "reason": "ok"}'
|
|
class _FakeChoice:
|
|
message = _FakeMsg()
|
|
class _FakeResp:
|
|
choices = [_FakeChoice()]
|
|
class _FakeClient:
|
|
class chat:
|
|
class completions:
|
|
@staticmethod
|
|
def create(**kwargs):
|
|
captured.update(kwargs)
|
|
return _FakeResp()
|
|
|
|
with patch("agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(_FakeClient, "fake-model")), \
|
|
patch("agent.auxiliary_client.get_auxiliary_extra_body",
|
|
return_value=None):
|
|
goals.judge_goal("ship it", "done", subgoals=None)
|
|
|
|
sent_messages = captured.get("messages") or []
|
|
user_msg = next((m["content"] for m in sent_messages if m["role"] == "user"), "")
|
|
assert "Additional criteria" not in user_msg
|
|
assert "ship it" in user_msg
|
|
|
|
|
|
class TestStatusLineSubgoalCount:
|
|
def test_status_line_no_subgoals(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sl-empty")
|
|
mgr.set("ship it")
|
|
line = mgr.status_line()
|
|
assert "ship it" in line
|
|
assert "subgoal" not in line.lower()
|
|
|
|
def test_status_line_with_subgoals(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="sl-with")
|
|
mgr.set("ship it")
|
|
mgr.add_subgoal("a")
|
|
mgr.add_subgoal("b")
|
|
line = mgr.status_line()
|
|
assert "2 subgoals" in line
|