fix(goals): auto-pause when judge model returns unparseable output

Weak judge models (e.g. deepseek-v4-flash) return empty strings or prose when asked for the strict {done, reason} JSON verdict. The old code failed-open to continue on every such turn, burning the entire turn budget with log lines like judge returned empty response judge reply was not JSON: "Let me analyze whether the goal..." and /goal clear could not stop it mid-loop without /stop. After N=3 consecutive *parse* failures (transport/API errors don't count — those are transient), the loop auto-pauses and prints: ⏸ Goal paused — the judge model (3 turns) isn't returning the required JSON verdict. Route the judge to a stricter model in ~/.hermes/config.yaml: auxiliary: goal_judge: provider: openrouter model: google/gemini-3-flash-preview Then /goal resume to continue. The counter resets on any usable reply (both "done"/"continue" and API errors) and persists across GoalManager reloads so cross-session resumes carry the correct state. Also fixes test_goal_verdict_send.py sharing a hardcoded session_id across tests — the shared id only worked because the previous _post_turn_goal_continuation was a never-awaited coroutine. Now that PR #19160 made it properly awaited, the xdist test-leakage bug surfaced. Each test gets a unique session_id via uuid suffix.
2026-05-08 03:01:47 +00:00 · 2026-05-07 17:19:47 -07:00 · 2026-05-07 17:19:47 -07:00 · 307c85e5c1
commit 307c85e5c1
parent 03ddff8897
4 changed files with 270 additions and 49 deletions
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@ -47,6 +47,14 @@ DEFAULT_MAX_TURNS = 20
 DEFAULT_JUDGE_TIMEOUT = 30.0
 # Cap how much of the last response + recent messages we send to the judge.
 _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # After this many consecutive judge *parse* failures (empty output / non-JSON),
 # the loop auto-pauses and points the user at the goal_judge config. API /
 # transport errors do NOT count toward this — those are transient. This guards
 # against small models (e.g. deepseek-v4-flash) that cannot follow the strict
 # JSON reply contract; without it the loop runs until the turn budget is
 # exhausted with every reply shaped like `judge returned empty response` or
 # `judge reply was not JSON`.
 DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
 CONTINUATION_PROMPT_TEMPLATE = (
@ -99,6 +107,7 @@ class GoalState:
    last_verdict: Optional[str] = None        # "done" | "continue" | "skipped"
    last_reason: Optional[str] = None
    paused_reason: Optional[str] = None       # why we auto-paused (budget, etc.)
    consecutive_parse_failures: int = 0       # judge-output parse failures in a row
    def to_json(self) -> str:
        return json.dumps(asdict(self), ensure_ascii=False)
@ -116,6 +125,7 @@ class GoalState:
            last_verdict=data.get("last_verdict"),
            last_reason=data.get("last_reason"),
            paused_reason=data.get("paused_reason"),
            consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
        )
@ -220,13 +230,17 @@ def _truncate(text: str, limit: int) -> str:
 _JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
-def _parse_judge_response(raw: str) -> Tuple[bool, str]:
+def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
-    """Parse the judge's reply. Fail-open to ``(False, "<reason>")``.
+    """Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
-    Returns ``(done, reason)``.
+    Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
    judge returned output that couldn't be interpreted as the expected JSON
    verdict (empty body, prose, malformed JSON). Callers use that flag to
    auto-pause after N consecutive parse failures so a weak judge model
    doesn't silently burn the turn budget.
    """
    if not raw:
-        return False, "judge returned empty response"
+        return False, "judge returned empty response", True
    text = raw.strip()
@ -252,7 +266,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
                data = None
    if not isinstance(data, dict):
-        return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}"
+        return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
    done_val = data.get("done")
    if isinstance(done_val, str):
@ -262,7 +276,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
    reason = str(data.get("reason") or "").strip()
    if not reason:
        reason = "no reason provided"
-    return done, reason
+    return done, reason, False
 def judge_goal(
@ -270,36 +284,42 @@ def judge_goal(
    last_response: str,
    *,
    timeout: float = DEFAULT_JUDGE_TIMEOUT,
-) -> Tuple[str, str]:
+) -> Tuple[str, str, bool]:
    """Ask the auxiliary model whether the goal is satisfied.
-    Returns ``(verdict, reason)`` where verdict is ``"done"``, ``"continue"``,
+    Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
-    or ``"skipped"`` (when the judge couldn't be reached).
+    ``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
-    This is deliberately fail-open: any error returns ``("continue", "...")``
+    ``parse_failed`` is True only when the judge call succeeded but its output
-    so a broken judge doesn't wedge progress — the turn budget is the
+    was unusable (empty or non-JSON). API/transport errors return False — they
-    backstop.
+    are transient and should fail-open silently. Callers use this flag to
    auto-pause after N consecutive parse failures (see
    ``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
    This is deliberately fail-open: any error returns ``("continue", "...", False)``
    so a broken judge doesn't wedge progress — the turn budget and the
    consecutive-parse-failures auto-pause are the backstops.
    """
    if not goal.strip():
-        return "skipped", "empty goal"
+        return "skipped", "empty goal", False
    if not last_response.strip():
        # No substantive reply this turn — almost certainly not done yet.
-        return "continue", "empty response (nothing to evaluate)"
+        return "continue", "empty response (nothing to evaluate)", False
    try:
        from agent.auxiliary_client import get_text_auxiliary_client
    except Exception as exc:
        logger.debug("goal judge: auxiliary client import failed: %s", exc)
-        return "continue", "auxiliary client unavailable"
+        return "continue", "auxiliary client unavailable", False
    try:
        client, model = get_text_auxiliary_client("goal_judge")
    except Exception as exc:
        logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
-        return "continue", "auxiliary client unavailable"
+        return "continue", "auxiliary client unavailable", False
    if client is None or not model:
-        return "continue", "no auxiliary client configured"
+        return "continue", "no auxiliary client configured", False
    prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
        goal=_truncate(goal, 2000),
@ -319,17 +339,17 @@ def judge_goal(
        )
    except Exception as exc:
        logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
-        return "continue", f"judge error: {type(exc).__name__}"
+        return "continue", f"judge error: {type(exc).__name__}", False
    try:
        raw = resp.choices[0].message.content or ""
    except Exception:
        raw = ""
-    done, reason = _parse_judge_response(raw)
+    done, reason, parse_failed = _parse_judge_response(raw)
    verdict = "done" if done else "continue"
    logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
-    return verdict, reason
+    return verdict, reason, parse_failed
 # ──────────────────────────────────────────────────────────────────────
@ -473,10 +493,18 @@ class GoalManager:
        state.turns_used += 1
        state.last_turn_at = time.time()
-        verdict, reason = judge_goal(state.goal, last_response)
+        verdict, reason, parse_failed = judge_goal(state.goal, last_response)
        state.last_verdict = verdict
        state.last_reason = reason
        # Track consecutive judge parse failures. Reset on any usable reply,
        # including API / transport errors (parse_failed=False) so a flaky
        # network doesn't trip the auto-pause meant for bad judge models.
        if parse_failed:
            state.consecutive_parse_failures += 1
        else:
            state.consecutive_parse_failures = 0
        if verdict == "done":
            state.status = "done"
            save_goal(self.session_id, state)
@ -489,6 +517,36 @@ class GoalManager:
                "message": f"✓ Goal achieved: {reason}",
            }
        # Auto-pause when the judge model can't produce the expected JSON
        # verdict N turns in a row. Points the user at the goal_judge config
        # so they can route this side task to a model that follows the
        # contract (e.g. google/gemini-3-flash-preview). Without this guard,
        # weak judge models burn the entire turn budget returning prose or
        # empty strings.
        if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
            state.status = "paused"
            state.paused_reason = (
                f"judge model returned unparseable output {state.consecutive_parse_failures} turns in a row"
            )
            save_goal(self.session_id, state)
            return {
                "status": "paused",
                "should_continue": False,
                "continuation_prompt": None,
                "verdict": "continue",
                "reason": reason,
                "message": (
                    f"⏸ Goal paused — the judge model ({state.consecutive_parse_failures} turns) "
                    "isn't returning the required JSON verdict. Route the judge to a stricter "
                    "model in ~/.hermes/config.yaml:\n"
                    "  auxiliary:\n"
                    "    goal_judge:\n"
                    "      provider: openrouter\n"
                    "      model: google/gemini-3-flash-preview\n"
                    "Then /goal resume to continue."
                ),
            }
        if state.turns_used >= state.max_turns:
            state.status = "paused"
            state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
--- a/scripts/release.py
+++ b/scripts/release.py
@ -58,6 +58,7 @@ AUTHOR_MAP = {
    "223003280+Abd0r@users.noreply.github.com": "Abd0r",
    "abdielv@proton.me": "AJV20",
    "mason@growagainorchids.com": "masonjames",
    "ytchen0719@gmail.com": "liquidchen",
    "am@studio1.tailb672fe.ts.net": "subtract0",
    "axmaiqiu@gmail.com": "qWaitCrypto",
    "159539633+MottledShadow@users.noreply.github.com": "MottledShadow",
--- a/tests/gateway/test_goal_verdict_send.py
+++ b/tests/gateway/test_goal_verdict_send.py
@ -61,8 +61,9 @@ class _RecordingAdapter:
        return _R()
-def _make_runner_with_adapter():
+def _make_runner_with_adapter(session_id: str = None):
    from gateway.run import GatewayRunner
    import uuid
    runner = object.__new__(GatewayRunner)
    runner.config = GatewayConfig(
@ -74,9 +75,12 @@ def _make_runner_with_adapter():
    runner._queued_events = {}
    src = _make_source()
    # Default to a unique session_id so xdist parallel runs on the same worker
    # don't see each other's GoalManager state (DEFAULT_DB_PATH gets frozen at
    # module-import time, defeating per-test HERMES_HOME monkeypatches).
    session_entry = SessionEntry(
        session_key=build_session_key(src),
-        session_id="goal-sess-1",
+        session_id=session_id or f"goal-sess-{uuid.uuid4().hex[:8]}",
        created_at=datetime.now(),
        updated_at=datetime.now(),
        platform=Platform.TELEGRAM,
@ -103,8 +107,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
    mgr = GoalManager(session_entry.session_id)
    mgr.set("ship the feature")
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped")):
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
-        runner._post_turn_goal_continuation(
+        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="I shipped the feature.",
@ -132,8 +136,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
    mgr = GoalManager(session_entry.session_id)
    mgr.set("polish the docs")
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work")):
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
-        runner._post_turn_goal_continuation(
+        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="here's a partial edit",
@ -160,8 +164,8 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
    state.turns_used = 2
    save_goal(session_entry.session_id, state)
-    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going")):
+    with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
-        runner._post_turn_goal_continuation(
+        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="still partial",
@ -181,7 +185,7 @@ async def test_goal_verdict_skipped_when_no_active_goal(hermes_home):
    """No goal set → the hook is a no-op. Nothing is sent, nothing enqueued."""
    runner, adapter, session_entry, src = _make_runner_with_adapter()
-    runner._post_turn_goal_continuation(
+    await runner._post_turn_goal_continuation(
        session_entry=session_entry,
        source=src,
        final_response="anything",
@ -207,9 +211,9 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
    runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
-    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok")):
+    with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
        # must not raise
-        runner._post_turn_goal_continuation(
+        await runner._post_turn_goal_continuation(
            session_entry=session_entry,
            source=src,
            final_response="whatever",
--- a/tests/hermes_cli/test_goals.py
+++ b/tests/hermes_cli/test_goals.py
@ -40,14 +40,14 @@ class TestParseJudgeResponse:
    def test_clean_json_done(self):
        from hermes_cli.goals import _parse_judge_response
-        done, reason = _parse_judge_response('{"done": true, "reason": "all good"}')
+        done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
        assert done is True
        assert reason == "all good"
    def test_clean_json_continue(self):
        from hermes_cli.goals import _parse_judge_response
-        done, reason = _parse_judge_response('{"done": false, "reason": "more work needed"}')
+        done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
        assert done is False
        assert reason == "more work needed"
@ -55,7 +55,7 @@ class TestParseJudgeResponse:
        from hermes_cli.goals import _parse_judge_response
        raw = '```json\n{"done": true, "reason": "done"}\n```'
-        done, reason = _parse_judge_response(raw)
+        done, reason, _ = _parse_judge_response(raw)
        assert done is True
        assert "done" in reason
@ -64,7 +64,7 @@ class TestParseJudgeResponse:
        from hermes_cli.goals import _parse_judge_response
        raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
-        done, reason = _parse_judge_response(raw)
+        done, reason, _ = _parse_judge_response(raw)
        assert done is False
        assert reason == "partial"
@ -72,24 +72,24 @@ class TestParseJudgeResponse:
        from hermes_cli.goals import _parse_judge_response
        for s in ("true", "yes", "done", "1"):
-            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
+            done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
            assert done is True
        for s in ("false", "no", "not yet"):
-            done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
+            done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
            assert done is False
    def test_malformed_json_fails_open(self):
        """Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
        from hermes_cli.goals import _parse_judge_response
-        done, reason = _parse_judge_response("this is not json at all")
+        done, reason, _ = _parse_judge_response("this is not json at all")
        assert done is False
        assert reason  # non-empty
    def test_empty_response(self):
        from hermes_cli.goals import _parse_judge_response
-        done, reason = _parse_judge_response("")
+        done, reason, _ = _parse_judge_response("")
        assert done is False
        assert reason
@ -103,13 +103,13 @@ class TestJudgeGoal:
    def test_empty_goal_skipped(self):
        from hermes_cli.goals import judge_goal
-        verdict, _ = judge_goal("", "some response")
+        verdict, _, _ = judge_goal("", "some response")
        assert verdict == "skipped"
    def test_empty_response_continues(self):
        from hermes_cli.goals import judge_goal
-        verdict, _ = judge_goal("ship the thing", "")
+        verdict, _, _ = judge_goal("ship the thing", "")
        assert verdict == "continue"
    def test_no_aux_client_continues(self):
@ -120,7 +120,7 @@ class TestJudgeGoal:
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(None, None),
        ):
-            verdict, _ = goals.judge_goal("my goal", "my response")
+            verdict, _, _ = goals.judge_goal("my goal", "my response")
        assert verdict == "continue"
    def test_api_error_continues(self):
@ -133,7 +133,7 @@ class TestJudgeGoal:
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
-            verdict, reason = goals.judge_goal("goal", "response")
+            verdict, reason, _ = goals.judge_goal("goal", "response")
        assert verdict == "continue"
        assert "judge error" in reason.lower()
@ -152,7 +152,7 @@ class TestJudgeGoal:
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
-            verdict, reason = goals.judge_goal("goal", "agent response")
+            verdict, reason, _ = goals.judge_goal("goal", "agent response")
        assert verdict == "done"
        assert reason == "achieved"
@ -171,7 +171,7 @@ class TestJudgeGoal:
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
-            verdict, reason = goals.judge_goal("goal", "agent response")
+            verdict, reason, _ = goals.judge_goal("goal", "agent response")
        assert verdict == "continue"
        assert reason == "not yet"
@ -260,7 +260,7 @@ class TestGoalManager:
        mgr = GoalManager(session_id="eval-sid-1")
        mgr.set("ship it")
-        with patch.object(goals, "judge_goal", return_value=("done", "shipped")):
+        with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
            decision = mgr.evaluate_after_turn("I shipped the feature.")
        assert decision["verdict"] == "done"
@ -276,7 +276,7 @@ class TestGoalManager:
        mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
        mgr.set("a long goal")
-        with patch.object(goals, "judge_goal", return_value=("continue", "more work")):
+        with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
            decision = mgr.evaluate_after_turn("made some progress")
        assert decision["verdict"] == "continue"
@ -294,7 +294,7 @@ class TestGoalManager:
        mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
        mgr.set("hard goal")
-        with patch.object(goals, "judge_goal", return_value=("continue", "not yet")):
+        with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
            d1 = mgr.evaluate_after_turn("step 1")
            assert d1["should_continue"] is True
            assert mgr.state.turns_used == 1
@ -356,3 +356,161 @@ def test_goal_command_dispatches_in_cli_registry_helpers():
    assert "/goal" in COMMANDS
    session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
    assert "/goal" in session_cmds
 # ──────────────────────────────────────────────────────────────────────
 # Auto-pause on consecutive judge parse failures
 # ──────────────────────────────────────────────────────────────────────
 class TestJudgeParseFailureAutoPause:
    """Regression: weak judge models (e.g. deepseek-v4-flash) that return
    empty strings or non-JSON prose must auto-pause the loop after N turns
    instead of burning the whole turn budget."""
    def test_parse_response_flags_empty_as_parse_failure(self):
        from hermes_cli.goals import _parse_judge_response
        done, reason, parse_failed = _parse_judge_response("")
        assert done is False
        assert parse_failed is True
        assert "empty" in reason.lower()
    def test_parse_response_flags_non_json_as_parse_failure(self):
        from hermes_cli.goals import _parse_judge_response
        done, reason, parse_failed = _parse_judge_response(
            "Let me analyze whether the goal is fully satisfied based on the agent's response..."
        )
        assert done is False
        assert parse_failed is True
        assert "not json" in reason.lower()
    def test_parse_response_clean_json_is_not_parse_failure(self):
        from hermes_cli.goals import _parse_judge_response
        done, _, parse_failed = _parse_judge_response(
            '{"done": false, "reason": "more work"}'
        )
        assert done is False
        assert parse_failed is False
    def test_api_error_does_not_count_as_parse_failure(self):
        """Transient network/API errors must not trip the auto-pause guard."""
        from hermes_cli import goals
        fake_client = MagicMock()
        fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
            verdict, _, parse_failed = goals.judge_goal("goal", "response")
        assert verdict == "continue"
        assert parse_failed is False
    def test_empty_judge_reply_flagged_as_parse_failure(self):
        """End-to-end: judge returns empty content → parse_failed=True."""
        from hermes_cli import goals
        fake_client = MagicMock()
        fake_client.chat.completions.create.return_value = MagicMock(
            choices=[MagicMock(message=MagicMock(content=""))]
        )
        with patch(
            "agent.auxiliary_client.get_text_auxiliary_client",
            return_value=(fake_client, "judge-model"),
        ):
            verdict, _, parse_failed = goals.judge_goal("goal", "response")
        assert verdict == "continue"
        assert parse_failed is True
    def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
        """N=3 consecutive parse failures → auto-pause with config pointer."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
        assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
        mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
        mgr.set("do a thing")
        with patch.object(
            goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
        ):
            d1 = mgr.evaluate_after_turn("step 1")
            assert d1["should_continue"] is True
            assert mgr.state.consecutive_parse_failures == 1
            d2 = mgr.evaluate_after_turn("step 2")
            assert d2["should_continue"] is True
            assert mgr.state.consecutive_parse_failures == 2
            d3 = mgr.evaluate_after_turn("step 3")
            assert d3["should_continue"] is False
            assert d3["status"] == "paused"
            assert mgr.state.consecutive_parse_failures == 3
            # Message points at the config surface so the user can fix it.
            assert "auxiliary" in d3["message"]
            assert "goal_judge" in d3["message"]
            assert "config.yaml" in d3["message"]
    def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
        """A single good judge reply resets the counter — transient flakes don't pause."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager
        mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
        mgr.set("another goal")
        # Two parse failures…
        with patch.object(
            goals, "judge_goal", return_value=("continue", "not json", True)
        ):
            mgr.evaluate_after_turn("step 1")
            mgr.evaluate_after_turn("step 2")
            assert mgr.state.consecutive_parse_failures == 2
        # …then one clean reply resets the counter.
        with patch.object(
            goals, "judge_goal", return_value=("continue", "making progress", False)
        ):
            d = mgr.evaluate_after_turn("step 3")
            assert d["should_continue"] is True
            assert mgr.state.consecutive_parse_failures == 0
    def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
        """API/transport errors must NOT count toward the auto-pause threshold."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager
        mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
        mgr.set("goal")
        with patch.object(
            goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
        ):
            for _ in range(5):
                d = mgr.evaluate_after_turn("still going")
                assert d["should_continue"] is True
            assert mgr.state.consecutive_parse_failures == 0
            assert mgr.state.status == "active"
    def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
        self, hermes_home
    ):
        """The counter must be durable so cross-session resumes see it."""
        from hermes_cli import goals
        from hermes_cli.goals import GoalManager, load_goal
        mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
        mgr.set("persistent goal")
        with patch.object(
            goals, "judge_goal", return_value=("continue", "empty", True)
        ):
            mgr.evaluate_after_turn("r")
            mgr.evaluate_after_turn("r")
        reloaded = load_goal("parse-fail-sid-4")
        assert reloaded is not None
        assert reloaded.consecutive_parse_failures == 2