fix(goals): auto-pause when judge model returns unparseable output

Weak judge models (e.g. deepseek-v4-flash) return empty strings or prose
when asked for the strict {done, reason} JSON verdict. The old code
failed open and continued on every such turn, burning the entire turn
budget with log lines like

  judge returned empty response
  judge reply was not JSON: "Let me analyze whether the goal..."

and /goal clear could not stop it mid-loop; only /stop could.

After N=3 consecutive *parse* failures (transport/API errors don't
count — those are transient), the loop auto-pauses and prints:

  ⏸ Goal paused — the judge model (3 turns) isn't returning the
  required JSON verdict. Route the judge to a stricter model in
  ~/.hermes/config.yaml:
    auxiliary:
      goal_judge:
        provider: openrouter
        model: google/gemini-3-flash-preview
  Then /goal resume to continue.
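
For reference, a usable judge reply is a single JSON object honoring the
{done, reason} contract, e.g.

  {"done": false, "reason": "tests still failing"}

Anything else (empty body, prose, malformed JSON) counts as a parse
failure.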

The counter resets on any turn that isn't a parse failure (a real
"done"/"continue" verdict, and also transient API errors) and persists
across GoalManager reloads so cross-session resumes carry the correct
state.
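
A condensed sketch of those reset/persistence semantics, lifted from the
new tests (judge_goal patched for illustration; the session id is made up):

  from unittest.mock import patch
  from hermes_cli import goals
  from hermes_cli.goals import GoalManager, load_goal

  mgr = GoalManager(session_id="demo-sid")
  mgr.set("demo goal")
  # Two unparseable judge replies bump the counter...
  with patch.object(goals, "judge_goal",
                    return_value=("continue", "not json", True)):
      mgr.evaluate_after_turn("step 1")
      mgr.evaluate_after_turn("step 2")
  # ...one usable verdict resets it, and the value round-trips through
  # save_goal/load_goal rather than living only on the manager.
  with patch.object(goals, "judge_goal",
                    return_value=("continue", "making progress", False)):
      mgr.evaluate_after_turn("step 3")
  assert load_goal("demo-sid").consecutive_parse_failures == 0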

Also fixes test_goal_verdict_send.py sharing a hardcoded session_id
across tests — the shared id only worked because the previous
_post_turn_goal_continuation was a never-awaited coroutine. Now that
PR #19160 made it properly awaited, the xdist test-leakage bug
surfaced. Each test gets a unique session_id via uuid suffix.
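
For context on why the leak stayed hidden: calling an async def without
awaiting it only creates a coroutine object and runs none of its body, so
the old tests never actually touched the shared GoalManager state (kwargs
elided in this sketch):

  # before: created a coroutine, executed nothing, leaked nothing
  runner._post_turn_goal_continuation(session_entry=..., source=..., final_response=...)
  # after PR #19160: the hook body actually runs, and shared ids collide
  await runner._post_turn_goal_continuation(session_entry=..., source=..., final_response=...)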
Teknium 2026-05-07 17:19:47 -07:00
parent 03ddff8897
commit 307c85e5c1
4 changed files with 270 additions and 49 deletions


@@ -47,6 +47,14 @@ DEFAULT_MAX_TURNS = 20
DEFAULT_JUDGE_TIMEOUT = 30.0
# Cap how much of the last response + recent messages we send to the judge.
_JUDGE_RESPONSE_SNIPPET_CHARS = 4000
# After this many consecutive judge *parse* failures (empty output / non-JSON),
# the loop auto-pauses and points the user at the goal_judge config. API /
# transport errors do NOT count toward this — those are transient. This guards
# against small models (e.g. deepseek-v4-flash) that cannot follow the strict
# JSON reply contract; without it the loop runs until the turn budget is
# exhausted with every reply shaped like `judge returned empty response` or
# `judge reply was not JSON`.
DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
CONTINUATION_PROMPT_TEMPLATE = (
@@ -99,6 +107,7 @@ class GoalState:
last_verdict: Optional[str] = None # "done" | "continue" | "skipped"
last_reason: Optional[str] = None
paused_reason: Optional[str] = None # why we auto-paused (budget, etc.)
consecutive_parse_failures: int = 0 # judge-output parse failures in a row
def to_json(self) -> str:
return json.dumps(asdict(self), ensure_ascii=False)
@@ -116,6 +125,7 @@ class GoalState:
last_verdict=data.get("last_verdict"),
last_reason=data.get("last_reason"),
paused_reason=data.get("paused_reason"),
consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
)
@@ -220,13 +230,17 @@ def _truncate(text: str, limit: int) -> str:
_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
def _parse_judge_response(raw: str) -> Tuple[bool, str]:
"""Parse the judge's reply. Fail-open to ``(False, "<reason>")``.
def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
"""Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
Returns ``(done, reason)``.
Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
judge returned output that couldn't be interpreted as the expected JSON
verdict (empty body, prose, malformed JSON). Callers use that flag to
auto-pause after N consecutive parse failures so a weak judge model
doesn't silently burn the turn budget.
"""
if not raw:
return False, "judge returned empty response"
return False, "judge returned empty response", True
text = raw.strip()
@@ -252,7 +266,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
data = None
if not isinstance(data, dict):
return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}"
return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
done_val = data.get("done")
if isinstance(done_val, str):
@@ -262,7 +276,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
reason = str(data.get("reason") or "").strip()
if not reason:
reason = "no reason provided"
return done, reason
return done, reason, False
def judge_goal(
@@ -270,36 +284,42 @@ def judge_goal(
last_response: str,
*,
timeout: float = DEFAULT_JUDGE_TIMEOUT,
) -> Tuple[str, str]:
) -> Tuple[str, str, bool]:
"""Ask the auxiliary model whether the goal is satisfied.
Returns ``(verdict, reason)`` where verdict is ``"done"``, ``"continue"``,
or ``"skipped"`` (when the judge couldn't be reached).
Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
This is deliberately fail-open: any error returns ``("continue", "...")``
so a broken judge doesn't wedge progress — the turn budget is the
backstop.
``parse_failed`` is True only when the judge call succeeded but its output
was unusable (empty or non-JSON). API/transport errors return False; they
are transient and should fail-open silently. Callers use this flag to
auto-pause after N consecutive parse failures (see
``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
This is deliberately fail-open: any error returns ``("continue", "...", False)``
so a broken judge doesn't wedge progress — the turn budget and the
consecutive-parse-failures auto-pause are the backstops.
"""
if not goal.strip():
return "skipped", "empty goal"
return "skipped", "empty goal", False
if not last_response.strip():
# No substantive reply this turn — almost certainly not done yet.
return "continue", "empty response (nothing to evaluate)"
return "continue", "empty response (nothing to evaluate)", False
try:
from agent.auxiliary_client import get_text_auxiliary_client
except Exception as exc:
logger.debug("goal judge: auxiliary client import failed: %s", exc)
return "continue", "auxiliary client unavailable"
return "continue", "auxiliary client unavailable", False
try:
client, model = get_text_auxiliary_client("goal_judge")
except Exception as exc:
logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
return "continue", "auxiliary client unavailable"
return "continue", "auxiliary client unavailable", False
if client is None or not model:
return "continue", "no auxiliary client configured"
return "continue", "no auxiliary client configured", False
prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
goal=_truncate(goal, 2000),
@@ -319,17 +339,17 @@ def judge_goal(
)
except Exception as exc:
logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
return "continue", f"judge error: {type(exc).__name__}"
return "continue", f"judge error: {type(exc).__name__}", False
try:
raw = resp.choices[0].message.content or ""
except Exception:
raw = ""
done, reason = _parse_judge_response(raw)
done, reason, parse_failed = _parse_judge_response(raw)
verdict = "done" if done else "continue"
logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
return verdict, reason
return verdict, reason, parse_failed
# ──────────────────────────────────────────────────────────────────────
@@ -473,10 +493,18 @@ class GoalManager:
state.turns_used += 1
state.last_turn_at = time.time()
verdict, reason = judge_goal(state.goal, last_response)
verdict, reason, parse_failed = judge_goal(state.goal, last_response)
state.last_verdict = verdict
state.last_reason = reason
# Track consecutive judge parse failures. Reset on any usable reply,
# including API / transport errors (parse_failed=False) so a flaky
# network doesn't trip the auto-pause meant for bad judge models.
if parse_failed:
state.consecutive_parse_failures += 1
else:
state.consecutive_parse_failures = 0
if verdict == "done":
state.status = "done"
save_goal(self.session_id, state)
@@ -489,6 +517,36 @@
"message": f"✓ Goal achieved: {reason}",
}
# Auto-pause when the judge model can't produce the expected JSON
# verdict N turns in a row. Points the user at the goal_judge config
# so they can route this side task to a model that follows the
# contract (e.g. google/gemini-3-flash-preview). Without this guard,
# weak judge models burn the entire turn budget returning prose or
# empty strings.
if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
state.status = "paused"
state.paused_reason = (
f"judge model returned unparseable output {state.consecutive_parse_failures} turns in a row"
)
save_goal(self.session_id, state)
return {
"status": "paused",
"should_continue": False,
"continuation_prompt": None,
"verdict": "continue",
"reason": reason,
"message": (
f"⏸ Goal paused — the judge model ({state.consecutive_parse_failures} turns) "
"isn't returning the required JSON verdict. Route the judge to a stricter "
"model in ~/.hermes/config.yaml:\n"
" auxiliary:\n"
" goal_judge:\n"
" provider: openrouter\n"
" model: google/gemini-3-flash-preview\n"
"Then /goal resume to continue."
),
}
if state.turns_used >= state.max_turns:
state.status = "paused"
state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"


@@ -58,6 +58,7 @@ AUTHOR_MAP = {
"223003280+Abd0r@users.noreply.github.com": "Abd0r",
"abdielv@proton.me": "AJV20",
"mason@growagainorchids.com": "masonjames",
"ytchen0719@gmail.com": "liquidchen",
"am@studio1.tailb672fe.ts.net": "subtract0",
"axmaiqiu@gmail.com": "qWaitCrypto",
"159539633+MottledShadow@users.noreply.github.com": "MottledShadow",


@@ -61,8 +61,9 @@ class _RecordingAdapter:
return _R()
def _make_runner_with_adapter():
def _make_runner_with_adapter(session_id: str = None):
from gateway.run import GatewayRunner
import uuid
runner = object.__new__(GatewayRunner)
runner.config = GatewayConfig(
@@ -74,9 +75,12 @@ def _make_runner_with_adapter():
runner._queued_events = {}
src = _make_source()
# Default to a unique session_id so xdist parallel runs on the same worker
# don't see each other's GoalManager state (DEFAULT_DB_PATH gets frozen at
# module-import time, defeating per-test HERMES_HOME monkeypatches).
session_entry = SessionEntry(
session_key=build_session_key(src),
session_id="goal-sess-1",
session_id=session_id or f"goal-sess-{uuid.uuid4().hex[:8]}",
created_at=datetime.now(),
updated_at=datetime.now(),
platform=Platform.TELEGRAM,
@@ -103,8 +107,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
mgr = GoalManager(session_entry.session_id)
mgr.set("ship the feature")
with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped")):
runner._post_turn_goal_continuation(
with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
await runner._post_turn_goal_continuation(
session_entry=session_entry,
source=src,
final_response="I shipped the feature.",
@@ -132,8 +136,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
mgr = GoalManager(session_entry.session_id)
mgr.set("polish the docs")
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work")):
runner._post_turn_goal_continuation(
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
await runner._post_turn_goal_continuation(
session_entry=session_entry,
source=src,
final_response="here's a partial edit",
@@ -160,8 +164,8 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
state.turns_used = 2
save_goal(session_entry.session_id, state)
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going")):
runner._post_turn_goal_continuation(
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
await runner._post_turn_goal_continuation(
session_entry=session_entry,
source=src,
final_response="still partial",
@@ -181,7 +185,7 @@ async def test_goal_verdict_skipped_when_no_active_goal(hermes_home):
"""No goal set → the hook is a no-op. Nothing is sent, nothing enqueued."""
runner, adapter, session_entry, src = _make_runner_with_adapter()
runner._post_turn_goal_continuation(
await runner._post_turn_goal_continuation(
session_entry=session_entry,
source=src,
final_response="anything",
@@ -207,9 +211,9 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok")):
with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
# must not raise
runner._post_turn_goal_continuation(
await runner._post_turn_goal_continuation(
session_entry=session_entry,
source=src,
final_response="whatever",


@@ -40,14 +40,14 @@ class TestParseJudgeResponse:
def test_clean_json_done(self):
from hermes_cli.goals import _parse_judge_response
done, reason = _parse_judge_response('{"done": true, "reason": "all good"}')
done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
assert done is True
assert reason == "all good"
def test_clean_json_continue(self):
from hermes_cli.goals import _parse_judge_response
done, reason = _parse_judge_response('{"done": false, "reason": "more work needed"}')
done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
assert done is False
assert reason == "more work needed"
@@ -55,7 +55,7 @@ class TestParseJudgeResponse:
from hermes_cli.goals import _parse_judge_response
raw = '```json\n{"done": true, "reason": "done"}\n```'
done, reason = _parse_judge_response(raw)
done, reason, _ = _parse_judge_response(raw)
assert done is True
assert "done" in reason
@@ -64,7 +64,7 @@ class TestParseJudgeResponse:
from hermes_cli.goals import _parse_judge_response
raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
done, reason = _parse_judge_response(raw)
done, reason, _ = _parse_judge_response(raw)
assert done is False
assert reason == "partial"
@@ -72,24 +72,24 @@ class TestParseJudgeResponse:
from hermes_cli.goals import _parse_judge_response
for s in ("true", "yes", "done", "1"):
done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
assert done is True
for s in ("false", "no", "not yet"):
done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
assert done is False
def test_malformed_json_fails_open(self):
"""Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
from hermes_cli.goals import _parse_judge_response
done, reason = _parse_judge_response("this is not json at all")
done, reason, _ = _parse_judge_response("this is not json at all")
assert done is False
assert reason # non-empty
def test_empty_response(self):
from hermes_cli.goals import _parse_judge_response
done, reason = _parse_judge_response("")
done, reason, _ = _parse_judge_response("")
assert done is False
assert reason
@@ -103,13 +103,13 @@ class TestJudgeGoal:
def test_empty_goal_skipped(self):
from hermes_cli.goals import judge_goal
verdict, _ = judge_goal("", "some response")
verdict, _, _ = judge_goal("", "some response")
assert verdict == "skipped"
def test_empty_response_continues(self):
from hermes_cli.goals import judge_goal
verdict, _ = judge_goal("ship the thing", "")
verdict, _, _ = judge_goal("ship the thing", "")
assert verdict == "continue"
def test_no_aux_client_continues(self):
@@ -120,7 +120,7 @@ class TestJudgeGoal:
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(None, None),
):
verdict, _ = goals.judge_goal("my goal", "my response")
verdict, _, _ = goals.judge_goal("my goal", "my response")
assert verdict == "continue"
def test_api_error_continues(self):
@@ -133,7 +133,7 @@ class TestJudgeGoal:
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(fake_client, "judge-model"),
):
verdict, reason = goals.judge_goal("goal", "response")
verdict, reason, _ = goals.judge_goal("goal", "response")
assert verdict == "continue"
assert "judge error" in reason.lower()
@@ -152,7 +152,7 @@ class TestJudgeGoal:
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(fake_client, "judge-model"),
):
verdict, reason = goals.judge_goal("goal", "agent response")
verdict, reason, _ = goals.judge_goal("goal", "agent response")
assert verdict == "done"
assert reason == "achieved"
@@ -171,7 +171,7 @@ class TestJudgeGoal:
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(fake_client, "judge-model"),
):
verdict, reason = goals.judge_goal("goal", "agent response")
verdict, reason, _ = goals.judge_goal("goal", "agent response")
assert verdict == "continue"
assert reason == "not yet"
@@ -260,7 +260,7 @@ class TestGoalManager:
mgr = GoalManager(session_id="eval-sid-1")
mgr.set("ship it")
with patch.object(goals, "judge_goal", return_value=("done", "shipped")):
with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
decision = mgr.evaluate_after_turn("I shipped the feature.")
assert decision["verdict"] == "done"
@@ -276,7 +276,7 @@ class TestGoalManager:
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
mgr.set("a long goal")
with patch.object(goals, "judge_goal", return_value=("continue", "more work")):
with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
decision = mgr.evaluate_after_turn("made some progress")
assert decision["verdict"] == "continue"
@@ -294,7 +294,7 @@ class TestGoalManager:
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
mgr.set("hard goal")
with patch.object(goals, "judge_goal", return_value=("continue", "not yet")):
with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
d1 = mgr.evaluate_after_turn("step 1")
assert d1["should_continue"] is True
assert mgr.state.turns_used == 1
@@ -356,3 +356,161 @@ def test_goal_command_dispatches_in_cli_registry_helpers():
assert "/goal" in COMMANDS
session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
assert "/goal" in session_cmds
# ──────────────────────────────────────────────────────────────────────
# Auto-pause on consecutive judge parse failures
# ──────────────────────────────────────────────────────────────────────
class TestJudgeParseFailureAutoPause:
"""Regression: weak judge models (e.g. deepseek-v4-flash) that return
empty strings or non-JSON prose must auto-pause the loop after N turns
instead of burning the whole turn budget."""
def test_parse_response_flags_empty_as_parse_failure(self):
from hermes_cli.goals import _parse_judge_response
done, reason, parse_failed = _parse_judge_response("")
assert done is False
assert parse_failed is True
assert "empty" in reason.lower()
def test_parse_response_flags_non_json_as_parse_failure(self):
from hermes_cli.goals import _parse_judge_response
done, reason, parse_failed = _parse_judge_response(
"Let me analyze whether the goal is fully satisfied based on the agent's response..."
)
assert done is False
assert parse_failed is True
assert "not json" in reason.lower()
def test_parse_response_clean_json_is_not_parse_failure(self):
from hermes_cli.goals import _parse_judge_response
done, _, parse_failed = _parse_judge_response(
'{"done": false, "reason": "more work"}'
)
assert done is False
assert parse_failed is False
def test_api_error_does_not_count_as_parse_failure(self):
"""Transient network/API errors must not trip the auto-pause guard."""
from hermes_cli import goals
fake_client = MagicMock()
fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
with patch(
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(fake_client, "judge-model"),
):
verdict, _, parse_failed = goals.judge_goal("goal", "response")
assert verdict == "continue"
assert parse_failed is False
def test_empty_judge_reply_flagged_as_parse_failure(self):
"""End-to-end: judge returns empty content → parse_failed=True."""
from hermes_cli import goals
fake_client = MagicMock()
fake_client.chat.completions.create.return_value = MagicMock(
choices=[MagicMock(message=MagicMock(content=""))]
)
with patch(
"agent.auxiliary_client.get_text_auxiliary_client",
return_value=(fake_client, "judge-model"),
):
verdict, _, parse_failed = goals.judge_goal("goal", "response")
assert verdict == "continue"
assert parse_failed is True
def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
"""N=3 consecutive parse failures → auto-pause with config pointer."""
from hermes_cli import goals
from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
mgr.set("do a thing")
with patch.object(
goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
):
d1 = mgr.evaluate_after_turn("step 1")
assert d1["should_continue"] is True
assert mgr.state.consecutive_parse_failures == 1
d2 = mgr.evaluate_after_turn("step 2")
assert d2["should_continue"] is True
assert mgr.state.consecutive_parse_failures == 2
d3 = mgr.evaluate_after_turn("step 3")
assert d3["should_continue"] is False
assert d3["status"] == "paused"
assert mgr.state.consecutive_parse_failures == 3
# Message points at the config surface so the user can fix it.
assert "auxiliary" in d3["message"]
assert "goal_judge" in d3["message"]
assert "config.yaml" in d3["message"]
def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
"""A single good judge reply resets the counter — transient flakes don't pause."""
from hermes_cli import goals
from hermes_cli.goals import GoalManager
mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
mgr.set("another goal")
# Two parse failures…
with patch.object(
goals, "judge_goal", return_value=("continue", "not json", True)
):
mgr.evaluate_after_turn("step 1")
mgr.evaluate_after_turn("step 2")
assert mgr.state.consecutive_parse_failures == 2
# …then one clean reply resets the counter.
with patch.object(
goals, "judge_goal", return_value=("continue", "making progress", False)
):
d = mgr.evaluate_after_turn("step 3")
assert d["should_continue"] is True
assert mgr.state.consecutive_parse_failures == 0
def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
"""API/transport errors must NOT count toward the auto-pause threshold."""
from hermes_cli import goals
from hermes_cli.goals import GoalManager
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
mgr.set("goal")
with patch.object(
goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
):
for _ in range(5):
d = mgr.evaluate_after_turn("still going")
assert d["should_continue"] is True
assert mgr.state.consecutive_parse_failures == 0
assert mgr.state.status == "active"
def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
self, hermes_home
):
"""The counter must be durable so cross-session resumes see it."""
from hermes_cli import goals
from hermes_cli.goals import GoalManager, load_goal
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
mgr.set("persistent goal")
with patch.object(
goals, "judge_goal", return_value=("continue", "empty", True)
):
mgr.evaluate_after_turn("r")
mgr.evaluate_after_turn("r")
reloaded = load_goal("parse-fail-sid-4")
assert reloaded is not None
assert reloaded.consecutive_parse_failures == 2