diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py
index 0f0f3abd9c..894cdddb01 100644
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@@ -47,6 +47,14 @@ DEFAULT_MAX_TURNS = 20
 DEFAULT_JUDGE_TIMEOUT = 30.0
 # Cap how much of the last response + recent messages we send to the judge.
 _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
+# After this many consecutive judge *parse* failures (empty output / non-JSON),
+# the loop auto-pauses and points the user at the goal_judge config. API /
+# transport errors do NOT count toward this — those are transient. This guards
+# against small models (e.g. deepseek-v4-flash) that cannot follow the strict
+# JSON reply contract; without it the loop runs until the turn budget is
+# exhausted with every reply shaped like `judge returned empty response` or
+# `judge reply was not JSON`.
+DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
 
 
 CONTINUATION_PROMPT_TEMPLATE = (
@@ -99,6 +107,7 @@ class GoalState:
     last_verdict: Optional[str] = None  # "done" | "continue" | "skipped"
     last_reason: Optional[str] = None
     paused_reason: Optional[str] = None  # why we auto-paused (budget, etc.)
+    consecutive_parse_failures: int = 0  # judge-output parse failures in a row
 
     def to_json(self) -> str:
         return json.dumps(asdict(self), ensure_ascii=False)
@@ -116,6 +125,7 @@ class GoalState:
             last_verdict=data.get("last_verdict"),
             last_reason=data.get("last_reason"),
             paused_reason=data.get("paused_reason"),
+            consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
         )
 
 
@@ -220,13 +230,17 @@ def _truncate(text: str, limit: int) -> str:
 _JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
 
 
-def _parse_judge_response(raw: str) -> Tuple[bool, str]:
-    """Parse the judge's reply. Fail-open to ``(False, "")``.
+def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
+    """Parse the judge's reply. Fails open rather than raising.
 
-    Returns ``(done, reason)``.
+    Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
+    judge returned output that couldn't be interpreted as the expected JSON
+    verdict (empty body, prose, malformed JSON). Callers use that flag to
+    auto-pause after N consecutive parse failures so a weak judge model
+    doesn't silently burn the turn budget.
     """
     if not raw:
-        return False, "judge returned empty response"
+        return False, "judge returned empty response", True
 
     text = raw.strip()
 
@@ -252,7 +266,7 @@
         data = None
 
     if not isinstance(data, dict):
-        return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}"
+        return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
 
     done_val = data.get("done")
     if isinstance(done_val, str):
@@ -262,7 +276,7 @@
     reason = str(data.get("reason") or "").strip()
     if not reason:
         reason = "no reason provided"
-    return done, reason
+    return done, reason, False
 
 
 def judge_goal(
@@ -270,36 +284,42 @@
     last_response: str,
     *,
     timeout: float = DEFAULT_JUDGE_TIMEOUT,
-) -> Tuple[str, str]:
+) -> Tuple[str, str, bool]:
     """Ask the auxiliary model whether the goal is satisfied.
 
-    Returns ``(verdict, reason)`` where verdict is ``"done"``, ``"continue"``,
-    or ``"skipped"`` (when the judge couldn't be reached).
+    Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
+    ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
 
-    This is deliberately fail-open: any error returns ``("continue", "...")``
-    so a broken judge doesn't wedge progress — the turn budget is the
-    backstop.
+    ``parse_failed`` is True only when the judge call succeeded but its output
+    was unusable (empty or non-JSON). API/transport errors report
+    ``parse_failed=False`` — they are transient and fail open silently. Callers
+    use this flag to auto-pause after N consecutive parse failures (see
+    ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
+
+    This is deliberately fail-open: any error returns ``("continue", "...", False)``
+    so a broken judge doesn't wedge progress — the turn budget and the
+    consecutive-parse-failures auto-pause are the backstops.
     """
     if not goal.strip():
-        return "skipped", "empty goal"
+        return "skipped", "empty goal", False
     if not last_response.strip():
         # No substantive reply this turn — almost certainly not done yet.
-        return "continue", "empty response (nothing to evaluate)"
+        return "continue", "empty response (nothing to evaluate)", False
 
     try:
         from agent.auxiliary_client import get_text_auxiliary_client
     except Exception as exc:
         logger.debug("goal judge: auxiliary client import failed: %s", exc)
-        return "continue", "auxiliary client unavailable"
+        return "continue", "auxiliary client unavailable", False
 
     try:
         client, model = get_text_auxiliary_client("goal_judge")
     except Exception as exc:
         logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
-        return "continue", "auxiliary client unavailable"
+        return "continue", "auxiliary client unavailable", False
 
     if client is None or not model:
-        return "continue", "no auxiliary client configured"
+        return "continue", "no auxiliary client configured", False
 
     prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
         goal=_truncate(goal, 2000),
@@ -319,17 +339,17 @@
         )
     except Exception as exc:
         logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
-        return "continue", f"judge error: {type(exc).__name__}"
+        return "continue", f"judge error: {type(exc).__name__}", False
 
     try:
         raw = resp.choices[0].message.content or ""
     except Exception:
         raw = ""
 
-    done, reason = _parse_judge_response(raw)
+    done, reason, parse_failed = _parse_judge_response(raw)
     verdict = "done" if done else "continue"
     logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
-    return verdict, reason
+    return verdict, reason, parse_failed
 
 
 # ──────────────────────────────────────────────────────────────────────
@@ -473,10 +493,18 @@ class GoalManager:
         state.turns_used += 1
         state.last_turn_at = time.time()
 
-        verdict, reason = judge_goal(state.goal, last_response)
+        verdict, reason, parse_failed = judge_goal(state.goal, last_response)
         state.last_verdict = verdict
         state.last_reason = reason
 
+        # Track consecutive judge parse failures. Reset on anything that is
+        # not a parse failure — API / transport errors report parse_failed=False
+        # — so a flaky network doesn't trip the auto-pause meant for bad judges.
+        if parse_failed:
+            state.consecutive_parse_failures += 1
+        else:
+            state.consecutive_parse_failures = 0
+
         if verdict == "done":
             state.status = "done"
             save_goal(self.session_id, state)
@@ -489,6 +517,36 @@ class GoalManager:
                 "message": f"✓ Goal achieved: {reason}",
             }
 
+        # Auto-pause when the judge model can't produce the expected JSON
+        # verdict N turns in a row. Points the user at the goal_judge config
+        # so they can route this side task to a model that follows the
+        # contract (e.g. google/gemini-3-flash-preview). Without this guard,
+        # weak judge models burn the entire turn budget returning prose or
+        # empty strings.
+        if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
+            state.status = "paused"
+            state.paused_reason = (
+                f"judge model returned unparseable output {state.consecutive_parse_failures} turns in a row"
+            )
+            save_goal(self.session_id, state)
+            return {
+                "status": "paused",
+                "should_continue": False,
+                "continuation_prompt": None,
+                "verdict": "continue",
+                "reason": reason,
+                "message": (
+                    "⏸ Goal paused — the judge model failed to return the required JSON "
+                    f"verdict {state.consecutive_parse_failures} turns in a row. Route the judge to a stricter "
+                    "model in ~/.hermes/config.yaml:\n"
+                    "  auxiliary:\n"
+                    "    goal_judge:\n"
+                    "      provider: openrouter\n"
+                    "      model: google/gemini-3-flash-preview\n"
+                    "Then /goal resume to continue."
+                ),
+            }
+
         if state.turns_used >= state.max_turns:
             state.status = "paused"
             state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
diff --git a/scripts/release.py b/scripts/release.py
index 07e2a3a747..5259b01b7a 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -58,6 +58,7 @@ AUTHOR_MAP = {
     "223003280+Abd0r@users.noreply.github.com": "Abd0r",
     "abdielv@proton.me": "AJV20",
     "mason@growagainorchids.com": "masonjames",
+    "ytchen0719@gmail.com": "liquidchen",
     "am@studio1.tailb672fe.ts.net": "subtract0",
     "axmaiqiu@gmail.com": "qWaitCrypto",
     "159539633+MottledShadow@users.noreply.github.com": "MottledShadow",
diff --git a/tests/gateway/test_goal_verdict_send.py b/tests/gateway/test_goal_verdict_send.py
index bb66851608..14f536aa4f 100644
--- a/tests/gateway/test_goal_verdict_send.py
+++ b/tests/gateway/test_goal_verdict_send.py
@@ -61,8 +61,9 @@ class _RecordingAdapter:
         return _R()
 
 
-def _make_runner_with_adapter():
+def _make_runner_with_adapter(session_id: str | None = None):
     from gateway.run import GatewayRunner
+    import uuid
 
     runner = object.__new__(GatewayRunner)
     runner.config = GatewayConfig(
@@ -74,9 +75,12 @@
     runner._queued_events = {}
 
     src = _make_source()
+    # Default to a unique session_id so xdist parallel runs on the same worker
+    # don't see each other's GoalManager state (DEFAULT_DB_PATH gets frozen at
+    # module-import time, defeating per-test HERMES_HOME monkeypatches).
     session_entry = SessionEntry(
         session_key=build_session_key(src),
-        session_id="goal-sess-1",
+        session_id=session_id or f"goal-sess-{uuid.uuid4().hex[:8]}",
         created_at=datetime.now(),
         updated_at=datetime.now(),
         platform=Platform.TELEGRAM,
@@ -103,8 +107,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
     mgr = GoalManager(session_entry.session_id)
     mgr.set("ship the feature")
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped")):
-        runner._post_turn_goal_continuation(
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
+        await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
             final_response="I shipped the feature.",
@@ -132,8 +136,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
     mgr = GoalManager(session_entry.session_id)
     mgr.set("polish the docs")
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work")):
-        runner._post_turn_goal_continuation(
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
+        await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
             final_response="here's a partial edit",
@@ -160,8 +164,8 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
     state.turns_used = 2
     save_goal(session_entry.session_id, state)
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going")):
-        runner._post_turn_goal_continuation(
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
+        await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
             final_response="still partial",
@@ -181,7 +185,7 @@ async def test_goal_verdict_skipped_when_no_active_goal(hermes_home):
     """No goal set → the hook is a no-op. Nothing is sent, nothing enqueued."""
     runner, adapter, session_entry, src = _make_runner_with_adapter()
 
-    runner._post_turn_goal_continuation(
+    await runner._post_turn_goal_continuation(
         session_entry=session_entry,
         source=src,
         final_response="anything",
@@ -207,9 +211,9 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
 
     runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
 
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok")):
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
         # must not raise
-        runner._post_turn_goal_continuation(
+        await runner._post_turn_goal_continuation(
             session_entry=session_entry,
             source=src,
             final_response="whatever",
diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py
index a21c5f4749..b5afd716c9 100644
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@@ -40,14 +40,14 @@ class TestParseJudgeResponse:
     def test_clean_json_done(self):
         from hermes_cli.goals import _parse_judge_response
 
-        done, reason = _parse_judge_response('{"done": true, "reason": "all good"}')
+        done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
         assert done is True
         assert reason == "all good"
 
     def test_clean_json_continue(self):
         from hermes_cli.goals import _parse_judge_response
 
-        done, reason = _parse_judge_response('{"done": false, "reason": "more work needed"}')
+        done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
         assert done is False
         assert reason == "more work needed"
 
@@ -55,7 +55,7 @@ class TestParseJudgeResponse:
         from hermes_cli.goals import _parse_judge_response
 
         raw = '```json\n{"done": true, "reason": "done"}\n```'
-        done, reason = _parse_judge_response(raw)
+        done, reason, _ = _parse_judge_response(raw)
         assert done is True
         assert "done" in reason
 
@@ -64,7 +64,7 @@ class TestParseJudgeResponse:
         from hermes_cli.goals import _parse_judge_response
 
         raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
-        done, reason = _parse_judge_response(raw)
+        done, reason, _ = _parse_judge_response(raw)
         assert done is False
         assert reason == "partial"
 
@@ -72,24 +72,24 @@ class TestParseJudgeResponse:
         from hermes_cli.goals import _parse_judge_response
 
         for s in ("true", "yes", "done", "1"):
-            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
+            done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
             assert done is True
 
         for s in ("false", "no", "not yet"):
-            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
+            done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
             assert done is False
 
     def test_malformed_json_fails_open(self):
         """Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
         from hermes_cli.goals import _parse_judge_response
 
-        done, reason = _parse_judge_response("this is not json at all")
+        done, reason, _ = _parse_judge_response("this is not json at all")
         assert done is False
         assert reason  # non-empty
 
     def test_empty_response(self):
         from hermes_cli.goals import _parse_judge_response
 
-        done, reason = _parse_judge_response("")
+        done, reason, _ = _parse_judge_response("")
         assert done is False
         assert reason
 
@@ -103,13 +103,13 @@ class TestJudgeGoal:
     def test_empty_goal_skipped(self):
         from hermes_cli.goals import judge_goal
 
-        verdict, _ = judge_goal("", "some response")
+        verdict, _, _ = judge_goal("", "some response")
         assert verdict == "skipped"
 
     def test_empty_response_continues(self):
         from hermes_cli.goals import judge_goal
 
-        verdict, _ = judge_goal("ship the thing", "")
+        verdict, _, _ = judge_goal("ship the thing", "")
         assert verdict == "continue"
 
     def test_no_aux_client_continues(self):
@@ -120,7 +120,7 @@
             "agent.auxiliary_client.get_text_auxiliary_client",
             return_value=(None, None),
         ):
-            verdict, _ = goals.judge_goal("my goal", "my response")
+            verdict, _, _ = goals.judge_goal("my goal", "my response")
         assert verdict == "continue"
 
     def test_api_error_continues(self):
@@ -133,7 +133,7 @@
             "agent.auxiliary_client.get_text_auxiliary_client",
             return_value=(fake_client, "judge-model"),
         ):
-            verdict, reason = goals.judge_goal("goal", "response")
+            verdict, reason, _ = goals.judge_goal("goal", "response")
         assert verdict == "continue"
         assert "judge error" in reason.lower()
 
@@ -152,7 +152,7 @@
             "agent.auxiliary_client.get_text_auxiliary_client",
             return_value=(fake_client, "judge-model"),
         ):
-            verdict, reason = goals.judge_goal("goal", "agent response")
+            verdict, reason, _ = goals.judge_goal("goal", "agent response")
         assert verdict == "done"
         assert reason == "achieved"
 
@@ -171,7 +171,7 @@
             "agent.auxiliary_client.get_text_auxiliary_client",
             return_value=(fake_client, "judge-model"),
        ):
-            verdict, reason = goals.judge_goal("goal", "agent response")
+            verdict, reason, _ = goals.judge_goal("goal", "agent response")
         assert verdict == "continue"
         assert reason == "not yet"
 
@@ -260,7 +260,7 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-1")
         mgr.set("ship it")
 
-        with patch.object(goals, "judge_goal", return_value=("done", "shipped")):
+        with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
             decision = mgr.evaluate_after_turn("I shipped the feature.")
 
         assert decision["verdict"] == "done"
@@ -276,7 +276,7 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
         mgr.set("a long goal")
 
-        with patch.object(goals, "judge_goal", return_value=("continue", "more work")):
+        with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
             decision = mgr.evaluate_after_turn("made some progress")
 
         assert decision["verdict"] == "continue"
@@ -294,7 +294,7 @@ class TestGoalManager:
         mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
         mgr.set("hard goal")
 
-        with patch.object(goals, "judge_goal", return_value=("continue", "not yet")):
+        with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
             d1 = mgr.evaluate_after_turn("step 1")
             assert d1["should_continue"] is True
             assert mgr.state.turns_used == 1
@@ -356,3 +356,161 @@ def test_goal_command_dispatches_in_cli_registry_helpers():
     assert "/goal" in COMMANDS
     session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
     assert "/goal" in session_cmds
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Auto-pause on consecutive judge parse failures
+# ──────────────────────────────────────────────────────────────────────
+
+
+class TestJudgeParseFailureAutoPause:
+    """Regression: weak judge models (e.g. deepseek-v4-flash) that return
+    empty strings or non-JSON prose must auto-pause the loop after N turns
+    instead of burning the whole turn budget."""
+
+    def test_parse_response_flags_empty_as_parse_failure(self):
+        from hermes_cli.goals import _parse_judge_response
+
+        done, reason, parse_failed = _parse_judge_response("")
+        assert done is False
+        assert parse_failed is True
+        assert "empty" in reason.lower()
+
+    def test_parse_response_flags_non_json_as_parse_failure(self):
+        from hermes_cli.goals import _parse_judge_response
+
+        done, reason, parse_failed = _parse_judge_response(
+            "Let me analyze whether the goal is fully satisfied based on the agent's response..."
+        )
+        assert done is False
+        assert parse_failed is True
+        assert "not json" in reason.lower()
+
+    def test_parse_response_clean_json_is_not_parse_failure(self):
+        from hermes_cli.goals import _parse_judge_response
+
+        done, _, parse_failed = _parse_judge_response(
+            '{"done": false, "reason": "more work"}'
+        )
+        assert done is False
+        assert parse_failed is False
+
+    def test_api_error_does_not_count_as_parse_failure(self):
+        """Transient network/API errors must not trip the auto-pause guard."""
+        from hermes_cli import goals
+
+        fake_client = MagicMock()
+        fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
+        with patch(
+            "agent.auxiliary_client.get_text_auxiliary_client",
+            return_value=(fake_client, "judge-model"),
+        ):
+            verdict, _, parse_failed = goals.judge_goal("goal", "response")
+        assert verdict == "continue"
+        assert parse_failed is False
+
+    def test_empty_judge_reply_flagged_as_parse_failure(self):
+        """End-to-end: judge returns empty content → parse_failed=True."""
+        from hermes_cli import goals
+
+        fake_client = MagicMock()
+        fake_client.chat.completions.create.return_value = MagicMock(
+            choices=[MagicMock(message=MagicMock(content=""))]
+        )
+        with patch(
+            "agent.auxiliary_client.get_text_auxiliary_client",
+            return_value=(fake_client, "judge-model"),
+        ):
+            verdict, _, parse_failed = goals.judge_goal("goal", "response")
+        assert verdict == "continue"
+        assert parse_failed is True
+
+    def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
+        """N=3 consecutive parse failures → auto-pause with config pointer."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
+
+        assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
+        mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
+        mgr.set("do a thing")
+
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
+        ):
+            d1 = mgr.evaluate_after_turn("step 1")
+            assert d1["should_continue"] is True
+            assert mgr.state.consecutive_parse_failures == 1
+
+            d2 = mgr.evaluate_after_turn("step 2")
+            assert d2["should_continue"] is True
+            assert mgr.state.consecutive_parse_failures == 2
+
+            d3 = mgr.evaluate_after_turn("step 3")
+            assert d3["should_continue"] is False
+            assert d3["status"] == "paused"
+            assert mgr.state.consecutive_parse_failures == 3
+            # Message points at the config surface so the user can fix it.
+            assert "auxiliary" in d3["message"]
+            assert "goal_judge" in d3["message"]
+            assert "config.yaml" in d3["message"]
+
+    def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
+        """A single good judge reply resets the counter — transient flakes don't pause."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager
+
+        mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
+        mgr.set("another goal")
+
+        # Two parse failures…
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "not json", True)
+        ):
+            mgr.evaluate_after_turn("step 1")
+            mgr.evaluate_after_turn("step 2")
+        assert mgr.state.consecutive_parse_failures == 2
+
+        # …then one clean reply resets the counter.
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "making progress", False)
+        ):
+            d = mgr.evaluate_after_turn("step 3")
+        assert d["should_continue"] is True
+        assert mgr.state.consecutive_parse_failures == 0
+
+    def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
+        """API/transport errors must NOT count toward the auto-pause threshold."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager
+
+        mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
+        mgr.set("goal")
+
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
+        ):
+            for _ in range(5):
+                d = mgr.evaluate_after_turn("still going")
+                assert d["should_continue"] is True
+        assert mgr.state.consecutive_parse_failures == 0
+        assert mgr.state.status == "active"
+
+    def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
+        self, hermes_home
+    ):
+        """The counter must be durable so cross-session resumes see it."""
+        from hermes_cli import goals
+        from hermes_cli.goals import GoalManager, load_goal
+
+        mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
+        mgr.set("persistent goal")
+
+        with patch.object(
+            goals, "judge_goal", return_value=("continue", "empty", True)
+        ):
+            mgr.evaluate_after_turn("r")
+            mgr.evaluate_after_turn("r")
+
+        reloaded = load_goal("parse-fail-sid-4")
+        assert reloaded is not None
+        assert reloaded.consecutive_parse_failures == 2
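
For anyone reviewing the counter semantics in isolation, here is a minimal standalone sketch of the contract `GoalManager.evaluate_after_turn` implements above. This is a toy re-statement, not project code: `PauseTracker` and `record()` are hypothetical names, and only the threshold of 3 mirrors `DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES` from the patch.

from dataclasses import dataclass


@dataclass
class PauseTracker:
    # Mirrors GoalState.consecutive_parse_failures / status from the patch.
    consecutive_parse_failures: int = 0
    status: str = "active"

    def record(self, parse_failed: bool, threshold: int = 3) -> None:
        """Apply one judge outcome: parse failures accumulate, anything else resets."""
        if parse_failed:
            self.consecutive_parse_failures += 1
        else:
            # Clean verdicts and transient API errors both report
            # parse_failed=False, so both reset the counter.
            self.consecutive_parse_failures = 0
        if self.consecutive_parse_failures >= threshold:
            self.status = "paused"


tracker = PauseTracker()
for parse_failed in (True, True, False, True, True, True):
    tracker.record(parse_failed)
assert tracker.status == "paused"  # the mid-run reset delays the pause to turn 6

The reset-on-anything-usable rule is what keeps a flaky network from masquerading as a broken judge model: only an unbroken run of unparseable replies trips the pause.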