mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
* feat(goals): /goal checklist + /subgoal user controls
Two-phase judge for /goal — Phase A decomposes the goal into a detailed
checklist on first turn; Phase B evaluates each pending item harshly
against the agent's most recent response. The goal completes only when
every item is in a terminal status (completed or impossible). Adds
/subgoal so the user can append, complete, mark impossible, undo,
remove, or clear items the judge missed or got wrong.
Mechanics:
- GoalState gains `checklist` and `decomposed` fields, both backwards
compatible (old state_meta rows load unchanged).
- Phase A: aux call writes a harsh, exhaustive checklist; biased toward
more items not fewer. Falls through to legacy freeform judge when
decompose fails.
- Phase B: judge gets the checklist + last-response snippet + path to
a per-session conversation dump at <HERMES_HOME>/goals/<sid>.json.
A bounded read_file tool (max 5 calls per turn, restricted to that
one file) lets the judge inspect history when the snippet is
ambiguous. Stickiness in code: terminal items are frozen, only the
user can revert via /subgoal undo.
- Continuation prompt shows checklist progress when non-empty;
reverts to old prompt when empty.
- Status line shows M/N done counts.
CLI + gateway + TUI gateway all pass the agent reference into
evaluate_after_turn so the dump can be written. Gateway-side
/subgoal is allowed mid-run since it only modifies the checklist
the judge consults at turn boundaries.
Tests: 24 new cases — backcompat round-trip, Phase A decompose,
Phase B updates + new_items + stickiness, user override flows,
conversation dump (incl. unsafe-sid sanitization), judge read_file
restriction. Existing freeform-mode tests updated to patch the
renamed `judge_goal_freeform` and skip Phase A explicitly.
* fix(goals): off-by-one in judge index, message-list plumbing, prompt tuning
Three live-test findings from running /goal end-to-end against
gemini-3-flash-preview as the judge:
1. Off-by-one bug — the judge sees the checklist rendered with 1-based
indices ('1. [ ] foo, 2. [ ] bar') but the apply layer indexed
state.checklist as 0-based. Result: every judge update landed on
the wrong item, evidence got attached to neighbouring rows, and
the genuine 'first pending' item (usually #1) never got marked.
Fix: convert 1 → 0 in _parse_evaluate_response. Also tightened the
user prompt to call out the 1-based scheme explicitly. New tests
cover the parser conversion + an end-to-end fake-judge round-trip.
2. Conversation dump never happened — _extract_agent_messages tried
common AIAgent attribute names (.messages, .conversation_history,
etc.) but AIAgent doesn't expose the message list as an instance
attribute; it lives inside run_conversation()'s scope. Result: the
judge's read_file tool always saw history_path=unavailable. Fix:
added an explicit messages= kwarg to evaluate_after_turn that all
three call sites (CLI, gateway, TUI gateway) now pass directly.
Agent-attribute extraction kept as back-compat fallback.
3. Prompt was too harsh on simple goals. The original 'be HARSH,
default to leaving items pending' wording made the judge refuse
to mark 'file exists' completed even after the agent ran ls,
test -f, os.path.isfile, and find — burning the entire 8-turn
budget on a fizzbuzz task. Softened to 'strict but not absurd'
with explicit guidance on what counts as evidence and a directive
not to require re-proving items already established earlier.
Re-tested live with the same fizzbuzz goal: now terminates in 2
turns with all 8 checklist items correctly attributed to their
own evidence. /subgoal user-action flow (add / complete / undo /
impossible) verified live as well.
1073 lines
44 KiB
Python
1073 lines
44 KiB
Python
"""Tests for hermes_cli/goals.py — persistent cross-turn goals."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Fixtures
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@pytest.fixture
|
|
def hermes_home(tmp_path, monkeypatch):
|
|
"""Isolated HERMES_HOME so SessionDB.state_meta writes don't clobber the real one."""
|
|
from pathlib import Path
|
|
|
|
home = tmp_path / ".hermes"
|
|
home.mkdir()
|
|
monkeypatch.setattr(Path, "home", lambda: tmp_path)
|
|
monkeypatch.setenv("HERMES_HOME", str(home))
|
|
|
|
# Bust the goal-module's DB cache for each test so it re-resolves HERMES_HOME.
|
|
from hermes_cli import goals
|
|
|
|
goals._DB_CACHE.clear()
|
|
yield home
|
|
goals._DB_CACHE.clear()
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# _parse_judge_response
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestParseJudgeResponse:
|
|
def test_clean_json_done(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
|
|
assert done is True
|
|
assert reason == "all good"
|
|
|
|
def test_clean_json_continue(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
|
|
assert done is False
|
|
assert reason == "more work needed"
|
|
|
|
def test_json_in_markdown_fence(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
raw = '```json\n{"done": true, "reason": "done"}\n```'
|
|
done, reason, _ = _parse_judge_response(raw)
|
|
assert done is True
|
|
assert "done" in reason
|
|
|
|
def test_json_embedded_in_prose(self):
|
|
"""Some models prefix reasoning before emitting JSON — we extract it."""
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
|
|
done, reason, _ = _parse_judge_response(raw)
|
|
assert done is False
|
|
assert reason == "partial"
|
|
|
|
def test_string_done_values(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
for s in ("true", "yes", "done", "1"):
|
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
|
assert done is True
|
|
for s in ("false", "no", "not yet"):
|
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
|
assert done is False
|
|
|
|
def test_malformed_json_fails_open(self):
|
|
"""Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response("this is not json at all")
|
|
assert done is False
|
|
assert reason # non-empty
|
|
|
|
def test_empty_response(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, _ = _parse_judge_response("")
|
|
assert done is False
|
|
assert reason
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# judge_goal — fail-open semantics
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeGoal:
|
|
def test_empty_goal_skipped(self):
|
|
from hermes_cli.goals import judge_goal
|
|
|
|
verdict, _, _ = judge_goal("", "some response")
|
|
assert verdict == "skipped"
|
|
|
|
def test_empty_response_continues(self):
|
|
from hermes_cli.goals import judge_goal
|
|
|
|
verdict, _, _ = judge_goal("ship the thing", "")
|
|
assert verdict == "continue"
|
|
|
|
def test_no_aux_client_continues(self):
|
|
"""Fail-open: if no aux client, we must return continue, not skipped/done."""
|
|
from hermes_cli import goals
|
|
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(None, None),
|
|
):
|
|
verdict, _, _ = goals.judge_goal("my goal", "my response")
|
|
assert verdict == "continue"
|
|
|
|
def test_api_error_continues(self):
|
|
"""Judge exception → fail-open continue (don't wedge progress on judge bugs)."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.side_effect = RuntimeError("boom")
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert "judge error" in reason.lower()
|
|
|
|
def test_judge_says_done(self):
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[
|
|
MagicMock(
|
|
message=MagicMock(content='{"done": true, "reason": "achieved"}')
|
|
)
|
|
]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
|
assert verdict == "done"
|
|
assert reason == "achieved"
|
|
|
|
def test_judge_says_continue(self):
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[
|
|
MagicMock(
|
|
message=MagicMock(content='{"done": false, "reason": "not yet"}')
|
|
)
|
|
]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
|
assert verdict == "continue"
|
|
assert reason == "not yet"
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# GoalManager lifecycle + persistence
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestGoalManager:
|
|
def test_no_goal_initial(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-1")
|
|
assert mgr.state is None
|
|
assert not mgr.is_active()
|
|
assert not mgr.has_goal()
|
|
assert "No active goal" in mgr.status_line()
|
|
|
|
def test_set_then_status(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-2", default_max_turns=5)
|
|
state = mgr.set("port the thing")
|
|
assert state.goal == "port the thing"
|
|
assert state.status == "active"
|
|
assert state.max_turns == 5
|
|
assert state.turns_used == 0
|
|
assert mgr.is_active()
|
|
assert "active" in mgr.status_line().lower()
|
|
assert "port the thing" in mgr.status_line()
|
|
|
|
def test_set_rejects_empty(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-3")
|
|
with pytest.raises(ValueError):
|
|
mgr.set("")
|
|
with pytest.raises(ValueError):
|
|
mgr.set(" ")
|
|
|
|
def test_pause_and_resume(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-4")
|
|
mgr.set("goal text")
|
|
mgr.pause(reason="user-paused")
|
|
assert mgr.state.status == "paused"
|
|
assert not mgr.is_active()
|
|
assert mgr.has_goal()
|
|
|
|
mgr.resume()
|
|
assert mgr.state.status == "active"
|
|
assert mgr.is_active()
|
|
|
|
def test_clear(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="test-sid-5")
|
|
mgr.set("goal")
|
|
mgr.clear()
|
|
assert mgr.state is None
|
|
assert not mgr.is_active()
|
|
|
|
def test_persistence_across_managers(self, hermes_home):
|
|
"""Key invariant: a second manager on the same session sees the goal.
|
|
|
|
This is what makes /resume work — each session rebinds its
|
|
GoalManager and picks up the saved state.
|
|
"""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr1 = GoalManager(session_id="persist-sid")
|
|
mgr1.set("do the thing")
|
|
|
|
mgr2 = GoalManager(session_id="persist-sid")
|
|
assert mgr2.state is not None
|
|
assert mgr2.state.goal == "do the thing"
|
|
assert mgr2.is_active()
|
|
|
|
def test_evaluate_after_turn_done(self, hermes_home):
|
|
"""Judge says done → status=done, no continuation.
|
|
|
|
Skips Phase-A decompose by patching ``decompose_goal`` to return
|
|
an empty checklist so the manager falls through to the freeform
|
|
judge path (legacy behavior preserved when decompose is unavailable).
|
|
"""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-1")
|
|
mgr.set("ship it")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(goals, "judge_goal_freeform", return_value=("done", "shipped", False)):
|
|
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
|
|
|
assert decision["verdict"] == "done"
|
|
assert decision["should_continue"] is False
|
|
assert decision["continuation_prompt"] is None
|
|
assert mgr.state.status == "done"
|
|
assert mgr.state.turns_used == 1
|
|
|
|
def test_evaluate_after_turn_continue_under_budget(self, hermes_home):
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
|
mgr.set("a long goal")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(goals, "judge_goal_freeform", return_value=("continue", "more work", False)):
|
|
decision = mgr.evaluate_after_turn("made some progress")
|
|
|
|
assert decision["verdict"] == "continue"
|
|
assert decision["should_continue"] is True
|
|
assert decision["continuation_prompt"] is not None
|
|
assert "a long goal" in decision["continuation_prompt"]
|
|
assert mgr.state.status == "active"
|
|
assert mgr.state.turns_used == 1
|
|
|
|
def test_evaluate_after_turn_budget_exhausted(self, hermes_home):
|
|
"""When turn budget hits ceiling, auto-pause instead of continuing."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
|
mgr.set("hard goal")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(goals, "judge_goal_freeform", return_value=("continue", "not yet", False)):
|
|
d1 = mgr.evaluate_after_turn("step 1")
|
|
assert d1["should_continue"] is True
|
|
assert mgr.state.turns_used == 1
|
|
assert mgr.state.status == "active"
|
|
|
|
d2 = mgr.evaluate_after_turn("step 2")
|
|
# turns_used is now 2 which equals max_turns → paused
|
|
assert d2["should_continue"] is False
|
|
assert mgr.state.status == "paused"
|
|
assert mgr.state.turns_used == 2
|
|
assert "budget" in (mgr.state.paused_reason or "").lower()
|
|
|
|
def test_evaluate_after_turn_inactive(self, hermes_home):
|
|
"""evaluate_after_turn is a no-op when goal isn't active."""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="eval-sid-4")
|
|
d = mgr.evaluate_after_turn("anything")
|
|
assert d["verdict"] == "inactive"
|
|
assert d["should_continue"] is False
|
|
|
|
mgr.set("a goal")
|
|
mgr.pause()
|
|
d2 = mgr.evaluate_after_turn("anything")
|
|
assert d2["verdict"] == "inactive"
|
|
assert d2["should_continue"] is False
|
|
|
|
def test_continuation_prompt_shape(self, hermes_home):
|
|
"""The continuation prompt must include the goal text verbatim —
|
|
and must be safe to inject as a user-role message (prompt-cache
|
|
invariants: no system-prompt mutation)."""
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="cont-sid")
|
|
mgr.set("port goal command to hermes")
|
|
prompt = mgr.next_continuation_prompt()
|
|
assert prompt is not None
|
|
assert "port goal command to hermes" in prompt
|
|
assert prompt.strip() # non-empty
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Smoke: CommandDef is wired
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_goal_command_in_registry():
|
|
from hermes_cli.commands import resolve_command
|
|
|
|
cmd = resolve_command("goal")
|
|
assert cmd is not None
|
|
assert cmd.name == "goal"
|
|
|
|
|
|
def test_goal_command_dispatches_in_cli_registry_helpers():
|
|
"""goal shows up in autocomplete / help categories alongside other Session cmds."""
|
|
from hermes_cli.commands import COMMANDS, COMMANDS_BY_CATEGORY
|
|
|
|
assert "/goal" in COMMANDS
|
|
session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
|
|
assert "/goal" in session_cmds
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Auto-pause on consecutive judge parse failures
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeParseFailureAutoPause:
|
|
"""Regression: weak judge models (e.g. deepseek-v4-flash) that return
|
|
empty strings or non-JSON prose must auto-pause the loop after N turns
|
|
instead of burning the whole turn budget."""
|
|
|
|
def test_parse_response_flags_empty_as_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, parse_failed = _parse_judge_response("")
|
|
assert done is False
|
|
assert parse_failed is True
|
|
assert "empty" in reason.lower()
|
|
|
|
def test_parse_response_flags_non_json_as_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, reason, parse_failed = _parse_judge_response(
|
|
"Let me analyze whether the goal is fully satisfied based on the agent's response..."
|
|
)
|
|
assert done is False
|
|
assert parse_failed is True
|
|
assert "not json" in reason.lower()
|
|
|
|
def test_parse_response_clean_json_is_not_parse_failure(self):
|
|
from hermes_cli.goals import _parse_judge_response
|
|
|
|
done, _, parse_failed = _parse_judge_response(
|
|
'{"done": false, "reason": "more work"}'
|
|
)
|
|
assert done is False
|
|
assert parse_failed is False
|
|
|
|
def test_api_error_does_not_count_as_parse_failure(self):
|
|
"""Transient network/API errors must not trip the auto-pause guard."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert parse_failed is False
|
|
|
|
def test_empty_judge_reply_flagged_as_parse_failure(self):
|
|
"""End-to-end: judge returns empty content → parse_failed=True."""
|
|
from hermes_cli import goals
|
|
|
|
fake_client = MagicMock()
|
|
fake_client.chat.completions.create.return_value = MagicMock(
|
|
choices=[MagicMock(message=MagicMock(content=""))]
|
|
)
|
|
with patch(
|
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
|
return_value=(fake_client, "judge-model"),
|
|
):
|
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
|
assert verdict == "continue"
|
|
assert parse_failed is True
|
|
|
|
def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
|
|
"""N=3 consecutive parse failures → auto-pause with config pointer."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
|
|
|
|
assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
|
|
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
|
|
mgr.set("do a thing")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(
|
|
goals, "judge_goal_freeform",
|
|
return_value=("continue", "judge returned empty response", True),
|
|
):
|
|
d1 = mgr.evaluate_after_turn("step 1")
|
|
assert d1["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 1
|
|
|
|
d2 = mgr.evaluate_after_turn("step 2")
|
|
assert d2["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 2
|
|
|
|
d3 = mgr.evaluate_after_turn("step 3")
|
|
assert d3["should_continue"] is False
|
|
assert d3["status"] == "paused"
|
|
assert mgr.state.consecutive_parse_failures == 3
|
|
# Message points at the config surface so the user can fix it.
|
|
assert "auxiliary" in d3["message"]
|
|
assert "goal_judge" in d3["message"]
|
|
assert "config.yaml" in d3["message"]
|
|
|
|
def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
|
|
"""A single good judge reply resets the counter — transient flakes don't pause."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
|
|
mgr.set("another goal")
|
|
|
|
# Two parse failures…
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(
|
|
goals, "judge_goal_freeform",
|
|
return_value=("continue", "not json", True),
|
|
):
|
|
mgr.evaluate_after_turn("step 1")
|
|
mgr.evaluate_after_turn("step 2")
|
|
assert mgr.state.consecutive_parse_failures == 2
|
|
|
|
# …then one clean reply resets the counter.
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(
|
|
goals, "judge_goal_freeform",
|
|
return_value=("continue", "making progress", False),
|
|
):
|
|
d = mgr.evaluate_after_turn("step 3")
|
|
assert d["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 0
|
|
|
|
def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
|
|
"""API/transport errors must NOT count toward the auto-pause threshold."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
|
|
mgr.set("goal")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(
|
|
goals, "judge_goal_freeform",
|
|
return_value=("continue", "judge error: RuntimeError", False),
|
|
):
|
|
for _ in range(5):
|
|
d = mgr.evaluate_after_turn("still going")
|
|
assert d["should_continue"] is True
|
|
assert mgr.state.consecutive_parse_failures == 0
|
|
assert mgr.state.status == "active"
|
|
|
|
def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
|
|
self, hermes_home
|
|
):
|
|
"""The counter must be durable so cross-session resumes see it."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager, load_goal
|
|
|
|
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
|
|
mgr.set("persistent goal")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "stub")), \
|
|
patch.object(
|
|
goals, "judge_goal_freeform",
|
|
return_value=("continue", "empty", True),
|
|
):
|
|
mgr.evaluate_after_turn("r")
|
|
mgr.evaluate_after_turn("r")
|
|
|
|
reloaded = load_goal("parse-fail-sid-4")
|
|
assert reloaded is not None
|
|
assert reloaded.consecutive_parse_failures == 2
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Checklist mode: GoalState backcompat + ChecklistItem
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestGoalStateBackcompat:
|
|
def test_old_state_meta_row_loads_without_checklist_fields(self):
|
|
"""A goal serialized BEFORE the checklist fields existed must
|
|
round-trip through GoalState.from_json with empty defaults."""
|
|
from hermes_cli.goals import GoalState
|
|
|
|
legacy_json = json.dumps({
|
|
"goal": "do the thing",
|
|
"status": "active",
|
|
"turns_used": 3,
|
|
"max_turns": 20,
|
|
"created_at": 1.0,
|
|
"last_turn_at": 2.0,
|
|
"last_verdict": "continue",
|
|
"last_reason": "still working",
|
|
"paused_reason": None,
|
|
"consecutive_parse_failures": 1,
|
|
})
|
|
state = GoalState.from_json(legacy_json)
|
|
assert state.goal == "do the thing"
|
|
assert state.checklist == []
|
|
assert state.decomposed is False
|
|
|
|
def test_new_state_round_trip(self):
|
|
from hermes_cli.goals import (
|
|
ChecklistItem,
|
|
GoalState,
|
|
ITEM_COMPLETED,
|
|
ITEM_PENDING,
|
|
ADDED_BY_JUDGE,
|
|
ADDED_BY_USER,
|
|
)
|
|
|
|
state = GoalState(
|
|
goal="g",
|
|
decomposed=True,
|
|
checklist=[
|
|
ChecklistItem(text="a", status=ITEM_COMPLETED,
|
|
added_by=ADDED_BY_JUDGE, evidence="done"),
|
|
ChecklistItem(text="b", status=ITEM_PENDING,
|
|
added_by=ADDED_BY_USER),
|
|
],
|
|
)
|
|
round_tripped = GoalState.from_json(state.to_json())
|
|
assert round_tripped.decomposed is True
|
|
assert len(round_tripped.checklist) == 2
|
|
assert round_tripped.checklist[0].text == "a"
|
|
assert round_tripped.checklist[0].status == ITEM_COMPLETED
|
|
assert round_tripped.checklist[0].evidence == "done"
|
|
assert round_tripped.checklist[1].added_by == ADDED_BY_USER
|
|
|
|
def test_checklist_counts_and_all_terminal(self):
|
|
from hermes_cli.goals import (
|
|
ChecklistItem, GoalState,
|
|
ITEM_COMPLETED, ITEM_IMPOSSIBLE, ITEM_PENDING,
|
|
)
|
|
|
|
state = GoalState(
|
|
goal="g",
|
|
checklist=[
|
|
ChecklistItem(text="a", status=ITEM_COMPLETED),
|
|
ChecklistItem(text="b", status=ITEM_IMPOSSIBLE),
|
|
ChecklistItem(text="c", status=ITEM_PENDING),
|
|
],
|
|
)
|
|
total, done, imp, pending = state.checklist_counts()
|
|
assert (total, done, imp, pending) == (3, 1, 1, 1)
|
|
assert state.all_terminal() is False
|
|
|
|
state.checklist[2].status = ITEM_IMPOSSIBLE
|
|
assert state.all_terminal() is True
|
|
|
|
def test_empty_checklist_is_not_all_terminal(self):
|
|
"""Empty list must NOT be considered done."""
|
|
from hermes_cli.goals import GoalState
|
|
|
|
state = GoalState(goal="g")
|
|
assert state.all_terminal() is False
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Phase A: decompose
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestPhaseADecompose:
|
|
def test_decompose_writes_checklist_and_marks_decomposed(self, hermes_home):
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_JUDGE
|
|
|
|
mgr = GoalManager(session_id="phase-a-sid-1")
|
|
mgr.set("build a website")
|
|
|
|
items = [{"text": "homepage exists"}, {"text": "is mobile-friendly"}]
|
|
with patch.object(goals, "decompose_goal", return_value=(items, None)):
|
|
d = mgr.evaluate_after_turn("(initial response)")
|
|
|
|
assert d["verdict"] == "decompose"
|
|
assert d["should_continue"] is True
|
|
# Phase A produces a continuation prompt that includes the checklist.
|
|
assert d["continuation_prompt"] is not None
|
|
assert "Checklist progress" in d["continuation_prompt"]
|
|
assert mgr.state.decomposed is True
|
|
assert len(mgr.state.checklist) == 2
|
|
assert mgr.state.checklist[0].text == "homepage exists"
|
|
assert mgr.state.checklist[0].status == ITEM_PENDING
|
|
assert mgr.state.checklist[0].added_by == ADDED_BY_JUDGE
|
|
|
|
def test_decompose_only_runs_once(self, hermes_home):
|
|
"""Decomposed=True after first call. Subsequent calls go to Phase B."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="phase-a-sid-2")
|
|
mgr.set("g")
|
|
|
|
with patch.object(
|
|
goals, "decompose_goal", return_value=([{"text": "x"}], None)
|
|
) as decompose_mock, patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=({"updates": [], "new_items": [], "reason": "..."}, False),
|
|
) as eval_mock:
|
|
mgr.evaluate_after_turn("turn 1")
|
|
mgr.evaluate_after_turn("turn 2")
|
|
mgr.evaluate_after_turn("turn 3")
|
|
|
|
assert decompose_mock.call_count == 1
|
|
assert eval_mock.call_count == 2
|
|
|
|
def test_decompose_failure_falls_back_to_freeform(self, hermes_home):
|
|
"""If decompose returns no items, manager falls through to freeform judge."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import GoalManager
|
|
|
|
mgr = GoalManager(session_id="phase-a-sid-3")
|
|
mgr.set("g")
|
|
|
|
with patch.object(goals, "decompose_goal", return_value=([], "model error")), \
|
|
patch.object(goals, "judge_goal_freeform",
|
|
return_value=("done", "shipped", False)):
|
|
d = mgr.evaluate_after_turn("done!")
|
|
|
|
assert d["verdict"] == "done"
|
|
assert mgr.state.decomposed is True
|
|
assert mgr.state.checklist == []
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Phase B: evaluate (checklist mode)
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestPhaseBChecklist:
|
|
def _make_decomposed_mgr(self, sid: str, items):
|
|
"""Helper: skip Phase A, install a decomposed checklist directly."""
|
|
from hermes_cli.goals import (
|
|
GoalManager, ChecklistItem, ITEM_PENDING, ADDED_BY_JUDGE,
|
|
)
|
|
from hermes_cli import goals as _g
|
|
mgr = GoalManager(session_id=sid)
|
|
mgr.set("a goal")
|
|
mgr.state.decomposed = True
|
|
mgr.state.checklist = [
|
|
ChecklistItem(text=t, status=ITEM_PENDING, added_by=ADDED_BY_JUDGE)
|
|
for t in items
|
|
]
|
|
_g.save_goal(sid, mgr.state)
|
|
return mgr
|
|
|
|
def test_judge_flips_pending_to_completed(self, hermes_home):
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import ITEM_COMPLETED, ITEM_PENDING
|
|
|
|
mgr = self._make_decomposed_mgr("phase-b-1", ["a", "b", "c"])
|
|
with patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=(
|
|
{
|
|
"updates": [
|
|
{"index": 0, "status": "completed", "evidence": "done"},
|
|
{"index": 1, "status": "completed", "evidence": "shipped"},
|
|
],
|
|
"new_items": [],
|
|
"reason": "made progress",
|
|
},
|
|
False,
|
|
),
|
|
):
|
|
d = mgr.evaluate_after_turn("agent did stuff")
|
|
|
|
assert d["verdict"] == "continue"
|
|
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
assert mgr.state.checklist[0].evidence == "done"
|
|
assert mgr.state.checklist[1].status == ITEM_COMPLETED
|
|
assert mgr.state.checklist[2].status == ITEM_PENDING
|
|
|
|
def test_goal_done_when_all_items_terminal(self, hermes_home):
|
|
from hermes_cli import goals
|
|
|
|
mgr = self._make_decomposed_mgr("phase-b-2", ["a", "b"])
|
|
with patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=(
|
|
{
|
|
"updates": [
|
|
{"index": 0, "status": "completed", "evidence": "ok"},
|
|
{"index": 1, "status": "impossible", "evidence": "blocked"},
|
|
],
|
|
"new_items": [],
|
|
"reason": "all done or blocked",
|
|
},
|
|
False,
|
|
),
|
|
):
|
|
d = mgr.evaluate_after_turn("response")
|
|
|
|
assert d["verdict"] == "done"
|
|
assert d["should_continue"] is False
|
|
assert mgr.state.status == "done"
|
|
|
|
def test_stickiness_judge_cannot_regress_completed(self, hermes_home):
|
|
"""Once an item is completed, judge updates trying to flip it back are ignored."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import ITEM_COMPLETED
|
|
|
|
mgr = self._make_decomposed_mgr("phase-b-stick", ["a"])
|
|
# First turn completes item 0.
|
|
with patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=(
|
|
{
|
|
"updates": [{"index": 0, "status": "completed", "evidence": "yes"}],
|
|
"new_items": [],
|
|
"reason": "done",
|
|
},
|
|
False,
|
|
),
|
|
):
|
|
mgr.evaluate_after_turn("turn 1")
|
|
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
# Second turn: judge tries to send a non-terminal update.
|
|
# _parse_evaluate_response already filters non-terminal, but at the
|
|
# apply layer we also skip terminal items entirely. Smoke both.
|
|
with patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=(
|
|
{
|
|
"updates": [{"index": 0, "status": "impossible", "evidence": "regress"}],
|
|
"new_items": [],
|
|
"reason": "trying to regress",
|
|
},
|
|
False,
|
|
),
|
|
):
|
|
mgr.evaluate_after_turn("turn 2")
|
|
# Sticky: status stays completed, evidence unchanged.
|
|
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
assert mgr.state.checklist[0].evidence == "yes"
|
|
|
|
def test_judge_appends_new_items(self, hermes_home):
|
|
from hermes_cli import goals
|
|
|
|
mgr = self._make_decomposed_mgr("phase-b-new", ["a"])
|
|
with patch.object(
|
|
goals, "evaluate_checklist",
|
|
return_value=(
|
|
{
|
|
"updates": [],
|
|
"new_items": [{"text": "newly discovered"}, {"text": "also this"}],
|
|
"reason": "found more work",
|
|
},
|
|
False,
|
|
),
|
|
):
|
|
mgr.evaluate_after_turn("response")
|
|
assert len(mgr.state.checklist) == 3
|
|
assert mgr.state.checklist[1].text == "newly discovered"
|
|
assert mgr.state.checklist[1].added_by == "judge"
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# /subgoal user controls
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestSubgoalUserControls:
|
|
def test_add_subgoal_appends_user_item(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager, ITEM_PENDING, ADDED_BY_USER
|
|
|
|
mgr = GoalManager(session_id="user-sid-1")
|
|
mgr.set("g")
|
|
item = mgr.add_subgoal("user added")
|
|
assert item.text == "user added"
|
|
assert item.status == ITEM_PENDING
|
|
assert item.added_by == ADDED_BY_USER
|
|
assert len(mgr.state.checklist) == 1
|
|
|
|
def test_add_subgoal_requires_active_goal(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="user-sid-2")
|
|
with pytest.raises(RuntimeError):
|
|
mgr.add_subgoal("x")
|
|
|
|
def test_add_subgoal_rejects_empty_text(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="user-sid-3")
|
|
mgr.set("g")
|
|
with pytest.raises(ValueError):
|
|
mgr.add_subgoal(" ")
|
|
|
|
def test_mark_subgoal_uses_1_based_index(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_IMPOSSIBLE
|
|
mgr = GoalManager(session_id="user-sid-4")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("a")
|
|
mgr.add_subgoal("b")
|
|
mgr.add_subgoal("c")
|
|
mgr.mark_subgoal(2, "completed")
|
|
mgr.mark_subgoal(3, "impossible")
|
|
assert mgr.state.checklist[0].status == "pending"
|
|
assert mgr.state.checklist[1].status == ITEM_COMPLETED
|
|
assert mgr.state.checklist[2].status == ITEM_IMPOSSIBLE
|
|
|
|
def test_mark_subgoal_rejects_invalid_index(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="user-sid-5")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("a")
|
|
with pytest.raises(IndexError):
|
|
mgr.mark_subgoal(5, "completed")
|
|
with pytest.raises(IndexError):
|
|
mgr.mark_subgoal(0, "completed")
|
|
|
|
def test_user_can_revert_terminal_item(self, hermes_home):
|
|
"""User mark_subgoal bypasses stickiness — only path to revert."""
|
|
from hermes_cli.goals import GoalManager, ITEM_COMPLETED, ITEM_PENDING
|
|
mgr = GoalManager(session_id="user-sid-6")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("a")
|
|
mgr.mark_subgoal(1, "completed")
|
|
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
mgr.mark_subgoal(1, "pending")
|
|
assert mgr.state.checklist[0].status == ITEM_PENDING
|
|
|
|
def test_remove_subgoal(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="user-sid-7")
|
|
mgr.set("g")
|
|
mgr.add_subgoal("a")
|
|
mgr.add_subgoal("b")
|
|
mgr.add_subgoal("c")
|
|
removed = mgr.remove_subgoal(2)
|
|
assert removed.text == "b"
|
|
assert [it.text for it in mgr.state.checklist] == ["a", "c"]
|
|
|
|
def test_clear_checklist_resets_decomposed(self, hermes_home):
|
|
from hermes_cli.goals import GoalManager
|
|
mgr = GoalManager(session_id="user-sid-8")
|
|
mgr.set("g")
|
|
mgr.state.decomposed = True
|
|
mgr.add_subgoal("a")
|
|
mgr.clear_checklist()
|
|
assert mgr.state.checklist == []
|
|
assert mgr.state.decomposed is False
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Conversation dump
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestConversationDump:
|
|
def test_dump_writes_messages_to_goals_dir(self, hermes_home):
|
|
from hermes_cli.goals import dump_conversation, conversation_dump_path
|
|
|
|
msgs = [
|
|
{"role": "user", "content": "hi"},
|
|
{"role": "assistant", "content": "hello"},
|
|
]
|
|
path = dump_conversation("dump-sid-1", msgs)
|
|
assert path is not None
|
|
assert path.exists()
|
|
# Path is under <HERMES_HOME>/goals/<sid>.json
|
|
assert path.parent.name == "goals"
|
|
assert path.name == "dump-sid-1.json"
|
|
|
|
loaded = json.loads(path.read_text())
|
|
assert loaded == msgs
|
|
|
|
# conversation_dump_path returns the same path
|
|
assert conversation_dump_path("dump-sid-1") == path
|
|
|
|
def test_dump_handles_unsafe_session_id(self, hermes_home):
|
|
from hermes_cli.goals import dump_conversation
|
|
|
|
path = dump_conversation("evil/../../sid", [{"role": "user", "content": "x"}])
|
|
assert path is not None
|
|
# No traversal — slashes are normalized to underscores. (Periods are
|
|
# preserved because they're legitimate in filenames; the resulting
|
|
# name still cannot escape <HERMES_HOME>/goals/ since path
|
|
# separators are gone.)
|
|
assert "/" not in path.name
|
|
assert path.parent.name == "goals"
|
|
# Verify the resolved path stays under the goals dir.
|
|
from hermes_cli.goals import _goals_dump_dir
|
|
goals_dir = _goals_dump_dir().resolve()
|
|
assert str(path.resolve()).startswith(str(goals_dir))
|
|
|
|
def test_dump_skips_when_messages_empty(self, hermes_home):
|
|
from hermes_cli.goals import dump_conversation
|
|
assert dump_conversation("sid", []) is None
|
|
assert dump_conversation("", [{"role": "user", "content": "x"}]) is None
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Judge read_file tool: path restriction
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeReadFile:
|
|
def test_restricted_to_allowed_path(self, hermes_home, tmp_path):
|
|
from hermes_cli.goals import _judge_read_file
|
|
|
|
allowed = tmp_path / "allowed.json"
|
|
allowed.write_text("hello\nworld\n")
|
|
|
|
ok = _judge_read_file(str(allowed), allowed_path=allowed)
|
|
loaded = json.loads(ok)
|
|
assert loaded["content"].startswith("hello")
|
|
|
|
# Try to read a different file.
|
|
sneaky = tmp_path / "secret.txt"
|
|
sneaky.write_text("nope\n")
|
|
denied = _judge_read_file(str(sneaky), allowed_path=allowed)
|
|
loaded = json.loads(denied)
|
|
assert "error" in loaded
|
|
assert "restricted" in loaded["error"]
|
|
|
|
def test_pagination(self, hermes_home, tmp_path):
|
|
from hermes_cli.goals import _judge_read_file
|
|
f = tmp_path / "big.json"
|
|
f.write_text("\n".join(f"line-{i}" for i in range(50)) + "\n")
|
|
|
|
# offset=10, limit=5 should return lines 10..14.
|
|
result = json.loads(_judge_read_file(str(f), offset=10, limit=5, allowed_path=f))
|
|
assert result["returned"] == 5
|
|
assert "line-9" in result["content"] # 1-based: line 10 == zero-indexed 9
|
|
assert result["next_offset"] == 15
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Index conversion: judge emits 1-based, apply layer uses 0-based
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestJudgeIndexConversion:
|
|
def test_parse_evaluate_converts_1based_to_0based(self):
|
|
"""The judge sees the checklist with 1-based indices (rendered as
|
|
'1. [ ] foo, 2. [ ] bar'). It emits updates with those same indices.
|
|
``_parse_evaluate_response`` must convert them to 0-based so the
|
|
apply layer can index ``state.checklist`` directly.
|
|
"""
|
|
from hermes_cli.goals import _parse_evaluate_response
|
|
|
|
raw = '''
|
|
{"updates": [
|
|
{"index": 1, "status": "completed", "evidence": "first item"},
|
|
{"index": 3, "status": "impossible", "evidence": "third item"}
|
|
],
|
|
"new_items": [],
|
|
"reason": "evaluated"}
|
|
'''
|
|
parsed, parse_failed = _parse_evaluate_response(raw)
|
|
assert parse_failed is False
|
|
# 1 → 0, 3 → 2
|
|
assert [u["index"] for u in parsed["updates"]] == [0, 2]
|
|
assert parsed["updates"][0]["evidence"] == "first item"
|
|
assert parsed["updates"][1]["status"] == "impossible"
|
|
|
|
def test_full_round_trip_judge_index_to_state(self, hermes_home):
|
|
"""End-to-end: judge emits 1-based, parser converts, apply layer
|
|
flips the right items in state.checklist."""
|
|
from hermes_cli import goals
|
|
from hermes_cli.goals import (
|
|
GoalManager, ChecklistItem, ITEM_PENDING, ITEM_COMPLETED,
|
|
ADDED_BY_JUDGE,
|
|
)
|
|
|
|
mgr = GoalManager(session_id="idx-round-trip")
|
|
mgr.set("g")
|
|
mgr.state.decomposed = True
|
|
mgr.state.checklist = [
|
|
ChecklistItem(text="first", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
ChecklistItem(text="second", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
ChecklistItem(text="third", status=ITEM_PENDING, added_by=ADDED_BY_JUDGE),
|
|
]
|
|
goals.save_goal("idx-round-trip", mgr.state)
|
|
|
|
# Simulate the judge returning a raw-JSON Phase-B reply via the
|
|
# auxiliary client: the parser handles the 1-based → 0-based
|
|
# conversion so the apply layer flips item 1 (text="first").
|
|
class FakeMessage:
|
|
content = '''
|
|
{"updates": [{"index": 1, "status": "completed", "evidence": "first done"}],
|
|
"new_items": [],
|
|
"reason": "..."}
|
|
'''
|
|
tool_calls = None
|
|
|
|
class FakeChoice:
|
|
message = FakeMessage()
|
|
|
|
class FakeResponse:
|
|
choices = [FakeChoice()]
|
|
|
|
class FakeClient:
|
|
class chat:
|
|
class completions:
|
|
@staticmethod
|
|
def create(**kwargs):
|
|
return FakeResponse()
|
|
|
|
with patch.object(goals, "_get_judge_client", return_value=(FakeClient, "fake-model")):
|
|
mgr.evaluate_after_turn("ran the script and item 1 is done")
|
|
|
|
# Item 1 (text="first") should now be completed.
|
|
assert mgr.state.checklist[0].text == "first"
|
|
assert mgr.state.checklist[0].status == ITEM_COMPLETED
|
|
assert mgr.state.checklist[0].evidence == "first done"
|
|
# Other items still pending.
|
|
assert mgr.state.checklist[1].status == ITEM_PENDING
|
|
assert mgr.state.checklist[2].status == ITEM_PENDING
|