mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(goals): auto-pause when judge model returns unparseable output
Weak judge models (e.g. deepseek-v4-flash) return empty strings or prose
when asked for the strict {done, reason} JSON verdict. The old code
failed open to a "continue" verdict on every such turn, burning the entire turn
budget with log lines like
judge returned empty response
judge reply was not JSON: "Let me analyze whether the goal..."
and /goal clear could not stop it mid-loop without /stop.
After N=3 consecutive *parse* failures (transport/API errors don't
count — those are transient), the loop auto-pauses and prints:
⏸ Goal paused — the judge model (3 turns) isn't returning the
required JSON verdict. Route the judge to a stricter model in
~/.hermes/config.yaml:
  auxiliary:
    goal_judge:
      provider: openrouter
      model: google/gemini-3-flash-preview
Then /goal resume to continue.
The counter resets on any turn that is not a parse failure (a usable
"done"/"continue" verdict, or an API/transport error) and persists across
GoalManager reloads so cross-session resumes carry the correct state.
Also fixes test_goal_verdict_send.py sharing a hardcoded session_id
across tests — the shared id only worked because the previous
_post_turn_goal_continuation was a never-awaited coroutine. Now that
PR #19160 made it properly awaited, the xdist test-leakage bug
surfaced. Each test gets a unique session_id via uuid suffix.
This commit is contained in:
parent
03ddff8897
commit
307c85e5c1
4 changed files with 270 additions and 49 deletions
|
|
@ -47,6 +47,14 @@ DEFAULT_MAX_TURNS = 20
|
||||||
DEFAULT_JUDGE_TIMEOUT = 30.0
|
DEFAULT_JUDGE_TIMEOUT = 30.0
|
||||||
# Cap how much of the last response + recent messages we send to the judge.
|
# Cap how much of the last response + recent messages we send to the judge.
|
||||||
_JUDGE_RESPONSE_SNIPPET_CHARS = 4000
|
_JUDGE_RESPONSE_SNIPPET_CHARS = 4000
|
||||||
|
# After this many consecutive judge *parse* failures (empty output / non-JSON),
|
||||||
|
# the loop auto-pauses and points the user at the goal_judge config. API /
|
||||||
|
# transport errors do NOT count toward this — those are transient. This guards
|
||||||
|
# against small models (e.g. deepseek-v4-flash) that cannot follow the strict
|
||||||
|
# JSON reply contract; without it the loop runs until the turn budget is
|
||||||
|
# exhausted with every reply shaped like `judge returned empty response` or
|
||||||
|
# `judge reply was not JSON`.
|
||||||
|
DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES = 3
|
||||||
|
|
||||||
|
|
||||||
CONTINUATION_PROMPT_TEMPLATE = (
|
CONTINUATION_PROMPT_TEMPLATE = (
|
||||||
|
|
@ -99,6 +107,7 @@ class GoalState:
|
||||||
last_verdict: Optional[str] = None # "done" | "continue" | "skipped"
|
last_verdict: Optional[str] = None # "done" | "continue" | "skipped"
|
||||||
last_reason: Optional[str] = None
|
last_reason: Optional[str] = None
|
||||||
paused_reason: Optional[str] = None # why we auto-paused (budget, etc.)
|
paused_reason: Optional[str] = None # why we auto-paused (budget, etc.)
|
||||||
|
consecutive_parse_failures: int = 0 # judge-output parse failures in a row
|
||||||
|
|
||||||
def to_json(self) -> str:
|
def to_json(self) -> str:
|
||||||
return json.dumps(asdict(self), ensure_ascii=False)
|
return json.dumps(asdict(self), ensure_ascii=False)
|
||||||
|
|
@ -116,6 +125,7 @@ class GoalState:
|
||||||
last_verdict=data.get("last_verdict"),
|
last_verdict=data.get("last_verdict"),
|
||||||
last_reason=data.get("last_reason"),
|
last_reason=data.get("last_reason"),
|
||||||
paused_reason=data.get("paused_reason"),
|
paused_reason=data.get("paused_reason"),
|
||||||
|
consecutive_parse_failures=int(data.get("consecutive_parse_failures", 0) or 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -220,13 +230,17 @@ def _truncate(text: str, limit: int) -> str:
|
||||||
_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
|
_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
def _parse_judge_response(raw: str) -> Tuple[bool, str]:
|
def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
|
||||||
"""Parse the judge's reply. Fail-open to ``(False, "<reason>")``.
|
"""Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
|
||||||
|
|
||||||
Returns ``(done, reason)``.
|
Returns ``(done, reason, parse_failed)``. ``parse_failed`` is True when the
|
||||||
|
judge returned output that couldn't be interpreted as the expected JSON
|
||||||
|
verdict (empty body, prose, malformed JSON). Callers use that flag to
|
||||||
|
auto-pause after N consecutive parse failures so a weak judge model
|
||||||
|
doesn't silently burn the turn budget.
|
||||||
"""
|
"""
|
||||||
if not raw:
|
if not raw:
|
||||||
return False, "judge returned empty response"
|
return False, "judge returned empty response", True
|
||||||
|
|
||||||
text = raw.strip()
|
text = raw.strip()
|
||||||
|
|
||||||
|
|
@ -252,7 +266,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
|
||||||
data = None
|
data = None
|
||||||
|
|
||||||
if not isinstance(data, dict):
|
if not isinstance(data, dict):
|
||||||
return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}"
|
return False, f"judge reply was not JSON: {_truncate(raw, 200)!r}", True
|
||||||
|
|
||||||
done_val = data.get("done")
|
done_val = data.get("done")
|
||||||
if isinstance(done_val, str):
|
if isinstance(done_val, str):
|
||||||
|
|
@ -262,7 +276,7 @@ def _parse_judge_response(raw: str) -> Tuple[bool, str]:
|
||||||
reason = str(data.get("reason") or "").strip()
|
reason = str(data.get("reason") or "").strip()
|
||||||
if not reason:
|
if not reason:
|
||||||
reason = "no reason provided"
|
reason = "no reason provided"
|
||||||
return done, reason
|
return done, reason, False
|
||||||
|
|
||||||
|
|
||||||
def judge_goal(
|
def judge_goal(
|
||||||
|
|
@ -270,36 +284,42 @@ def judge_goal(
|
||||||
last_response: str,
|
last_response: str,
|
||||||
*,
|
*,
|
||||||
timeout: float = DEFAULT_JUDGE_TIMEOUT,
|
timeout: float = DEFAULT_JUDGE_TIMEOUT,
|
||||||
) -> Tuple[str, str]:
|
) -> Tuple[str, str, bool]:
|
||||||
"""Ask the auxiliary model whether the goal is satisfied.
|
"""Ask the auxiliary model whether the goal is satisfied.
|
||||||
|
|
||||||
Returns ``(verdict, reason)`` where verdict is ``"done"``, ``"continue"``,
|
Returns ``(verdict, reason, parse_failed)`` where verdict is ``"done"``,
|
||||||
or ``"skipped"`` (when the judge couldn't be reached).
|
``"continue"``, or ``"skipped"`` (when the judge couldn't be reached).
|
||||||
|
|
||||||
This is deliberately fail-open: any error returns ``("continue", "...")``
|
``parse_failed`` is True only when the judge call succeeded but its output
|
||||||
so a broken judge doesn't wedge progress — the turn budget is the
|
was unusable (empty or non-JSON). API/transport errors return False — they
|
||||||
backstop.
|
are transient and should fail-open silently. Callers use this flag to
|
||||||
|
auto-pause after N consecutive parse failures (see
|
||||||
|
``DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES``).
|
||||||
|
|
||||||
|
This is deliberately fail-open: any error returns ``("continue", "...", False)``
|
||||||
|
so a broken judge doesn't wedge progress — the turn budget and the
|
||||||
|
consecutive-parse-failures auto-pause are the backstops.
|
||||||
"""
|
"""
|
||||||
if not goal.strip():
|
if not goal.strip():
|
||||||
return "skipped", "empty goal"
|
return "skipped", "empty goal", False
|
||||||
if not last_response.strip():
|
if not last_response.strip():
|
||||||
# No substantive reply this turn — almost certainly not done yet.
|
# No substantive reply this turn — almost certainly not done yet.
|
||||||
return "continue", "empty response (nothing to evaluate)"
|
return "continue", "empty response (nothing to evaluate)", False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from agent.auxiliary_client import get_text_auxiliary_client
|
from agent.auxiliary_client import get_text_auxiliary_client
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("goal judge: auxiliary client import failed: %s", exc)
|
logger.debug("goal judge: auxiliary client import failed: %s", exc)
|
||||||
return "continue", "auxiliary client unavailable"
|
return "continue", "auxiliary client unavailable", False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client, model = get_text_auxiliary_client("goal_judge")
|
client, model = get_text_auxiliary_client("goal_judge")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
|
logger.debug("goal judge: get_text_auxiliary_client failed: %s", exc)
|
||||||
return "continue", "auxiliary client unavailable"
|
return "continue", "auxiliary client unavailable", False
|
||||||
|
|
||||||
if client is None or not model:
|
if client is None or not model:
|
||||||
return "continue", "no auxiliary client configured"
|
return "continue", "no auxiliary client configured", False
|
||||||
|
|
||||||
prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
|
prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
|
||||||
goal=_truncate(goal, 2000),
|
goal=_truncate(goal, 2000),
|
||||||
|
|
@ -319,17 +339,17 @@ def judge_goal(
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
|
logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
|
||||||
return "continue", f"judge error: {type(exc).__name__}"
|
return "continue", f"judge error: {type(exc).__name__}", False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
raw = resp.choices[0].message.content or ""
|
raw = resp.choices[0].message.content or ""
|
||||||
except Exception:
|
except Exception:
|
||||||
raw = ""
|
raw = ""
|
||||||
|
|
||||||
done, reason = _parse_judge_response(raw)
|
done, reason, parse_failed = _parse_judge_response(raw)
|
||||||
verdict = "done" if done else "continue"
|
verdict = "done" if done else "continue"
|
||||||
logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
|
logger.info("goal judge: verdict=%s reason=%s", verdict, _truncate(reason, 120))
|
||||||
return verdict, reason
|
return verdict, reason, parse_failed
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
@ -473,10 +493,18 @@ class GoalManager:
|
||||||
state.turns_used += 1
|
state.turns_used += 1
|
||||||
state.last_turn_at = time.time()
|
state.last_turn_at = time.time()
|
||||||
|
|
||||||
verdict, reason = judge_goal(state.goal, last_response)
|
verdict, reason, parse_failed = judge_goal(state.goal, last_response)
|
||||||
state.last_verdict = verdict
|
state.last_verdict = verdict
|
||||||
state.last_reason = reason
|
state.last_reason = reason
|
||||||
|
|
||||||
|
# Track consecutive judge parse failures. Reset on any usable reply,
|
||||||
|
# including API / transport errors (parse_failed=False) so a flaky
|
||||||
|
# network doesn't trip the auto-pause meant for bad judge models.
|
||||||
|
if parse_failed:
|
||||||
|
state.consecutive_parse_failures += 1
|
||||||
|
else:
|
||||||
|
state.consecutive_parse_failures = 0
|
||||||
|
|
||||||
if verdict == "done":
|
if verdict == "done":
|
||||||
state.status = "done"
|
state.status = "done"
|
||||||
save_goal(self.session_id, state)
|
save_goal(self.session_id, state)
|
||||||
|
|
@ -489,6 +517,36 @@ class GoalManager:
|
||||||
"message": f"✓ Goal achieved: {reason}",
|
"message": f"✓ Goal achieved: {reason}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Auto-pause when the judge model can't produce the expected JSON
|
||||||
|
# verdict N turns in a row. Points the user at the goal_judge config
|
||||||
|
# so they can route this side task to a model that follows the
|
||||||
|
# contract (e.g. google/gemini-3-flash-preview). Without this guard,
|
||||||
|
# weak judge models burn the entire turn budget returning prose or
|
||||||
|
# empty strings.
|
||||||
|
if state.consecutive_parse_failures >= DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES:
|
||||||
|
state.status = "paused"
|
||||||
|
state.paused_reason = (
|
||||||
|
f"judge model returned unparseable output {state.consecutive_parse_failures} turns in a row"
|
||||||
|
)
|
||||||
|
save_goal(self.session_id, state)
|
||||||
|
return {
|
||||||
|
"status": "paused",
|
||||||
|
"should_continue": False,
|
||||||
|
"continuation_prompt": None,
|
||||||
|
"verdict": "continue",
|
||||||
|
"reason": reason,
|
||||||
|
"message": (
|
||||||
|
f"⏸ Goal paused — the judge model ({state.consecutive_parse_failures} turns) "
|
||||||
|
"isn't returning the required JSON verdict. Route the judge to a stricter "
|
||||||
|
"model in ~/.hermes/config.yaml:\n"
|
||||||
|
" auxiliary:\n"
|
||||||
|
" goal_judge:\n"
|
||||||
|
" provider: openrouter\n"
|
||||||
|
" model: google/gemini-3-flash-preview\n"
|
||||||
|
"Then /goal resume to continue."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
if state.turns_used >= state.max_turns:
|
if state.turns_used >= state.max_turns:
|
||||||
state.status = "paused"
|
state.status = "paused"
|
||||||
state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
|
state.paused_reason = f"turn budget exhausted ({state.turns_used}/{state.max_turns})"
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,7 @@ AUTHOR_MAP = {
|
||||||
"223003280+Abd0r@users.noreply.github.com": "Abd0r",
|
"223003280+Abd0r@users.noreply.github.com": "Abd0r",
|
||||||
"abdielv@proton.me": "AJV20",
|
"abdielv@proton.me": "AJV20",
|
||||||
"mason@growagainorchids.com": "masonjames",
|
"mason@growagainorchids.com": "masonjames",
|
||||||
|
"ytchen0719@gmail.com": "liquidchen",
|
||||||
"am@studio1.tailb672fe.ts.net": "subtract0",
|
"am@studio1.tailb672fe.ts.net": "subtract0",
|
||||||
"axmaiqiu@gmail.com": "qWaitCrypto",
|
"axmaiqiu@gmail.com": "qWaitCrypto",
|
||||||
"159539633+MottledShadow@users.noreply.github.com": "MottledShadow",
|
"159539633+MottledShadow@users.noreply.github.com": "MottledShadow",
|
||||||
|
|
|
||||||
|
|
@ -61,8 +61,9 @@ class _RecordingAdapter:
|
||||||
return _R()
|
return _R()
|
||||||
|
|
||||||
|
|
||||||
def _make_runner_with_adapter():
|
def _make_runner_with_adapter(session_id: str = None):
|
||||||
from gateway.run import GatewayRunner
|
from gateway.run import GatewayRunner
|
||||||
|
import uuid
|
||||||
|
|
||||||
runner = object.__new__(GatewayRunner)
|
runner = object.__new__(GatewayRunner)
|
||||||
runner.config = GatewayConfig(
|
runner.config = GatewayConfig(
|
||||||
|
|
@ -74,9 +75,12 @@ def _make_runner_with_adapter():
|
||||||
runner._queued_events = {}
|
runner._queued_events = {}
|
||||||
|
|
||||||
src = _make_source()
|
src = _make_source()
|
||||||
|
# Default to a unique session_id so xdist parallel runs on the same worker
|
||||||
|
# don't see each other's GoalManager state (DEFAULT_DB_PATH gets frozen at
|
||||||
|
# module-import time, defeating per-test HERMES_HOME monkeypatches).
|
||||||
session_entry = SessionEntry(
|
session_entry = SessionEntry(
|
||||||
session_key=build_session_key(src),
|
session_key=build_session_key(src),
|
||||||
session_id="goal-sess-1",
|
session_id=session_id or f"goal-sess-{uuid.uuid4().hex[:8]}",
|
||||||
created_at=datetime.now(),
|
created_at=datetime.now(),
|
||||||
updated_at=datetime.now(),
|
updated_at=datetime.now(),
|
||||||
platform=Platform.TELEGRAM,
|
platform=Platform.TELEGRAM,
|
||||||
|
|
@ -103,8 +107,8 @@ async def test_goal_verdict_done_sent_via_adapter_send(hermes_home):
|
||||||
mgr = GoalManager(session_entry.session_id)
|
mgr = GoalManager(session_entry.session_id)
|
||||||
mgr.set("ship the feature")
|
mgr.set("ship the feature")
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped")):
|
with patch("hermes_cli.goals.judge_goal", return_value=("done", "the feature shipped", False)):
|
||||||
runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
final_response="I shipped the feature.",
|
final_response="I shipped the feature.",
|
||||||
|
|
@ -132,8 +136,8 @@ async def test_goal_verdict_continue_enqueues_continuation(hermes_home):
|
||||||
mgr = GoalManager(session_entry.session_id)
|
mgr = GoalManager(session_entry.session_id)
|
||||||
mgr.set("polish the docs")
|
mgr.set("polish the docs")
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work")):
|
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "still needs work", False)):
|
||||||
runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
final_response="here's a partial edit",
|
final_response="here's a partial edit",
|
||||||
|
|
@ -160,8 +164,8 @@ async def test_goal_verdict_budget_exhausted_sends_pause(hermes_home):
|
||||||
state.turns_used = 2
|
state.turns_used = 2
|
||||||
save_goal(session_entry.session_id, state)
|
save_goal(session_entry.session_id, state)
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going")):
|
with patch("hermes_cli.goals.judge_goal", return_value=("continue", "keep going", False)):
|
||||||
runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
final_response="still partial",
|
final_response="still partial",
|
||||||
|
|
@ -181,7 +185,7 @@ async def test_goal_verdict_skipped_when_no_active_goal(hermes_home):
|
||||||
"""No goal set → the hook is a no-op. Nothing is sent, nothing enqueued."""
|
"""No goal set → the hook is a no-op. Nothing is sent, nothing enqueued."""
|
||||||
runner, adapter, session_entry, src = _make_runner_with_adapter()
|
runner, adapter, session_entry, src = _make_runner_with_adapter()
|
||||||
|
|
||||||
runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
final_response="anything",
|
final_response="anything",
|
||||||
|
|
@ -207,9 +211,9 @@ async def test_goal_verdict_survives_adapter_without_send(hermes_home):
|
||||||
|
|
||||||
runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
|
runner.adapters[Platform.TELEGRAM] = _NoSendAdapter()
|
||||||
|
|
||||||
with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok")):
|
with patch("hermes_cli.goals.judge_goal", return_value=("done", "ok", False)):
|
||||||
# must not raise
|
# must not raise
|
||||||
runner._post_turn_goal_continuation(
|
await runner._post_turn_goal_continuation(
|
||||||
session_entry=session_entry,
|
session_entry=session_entry,
|
||||||
source=src,
|
source=src,
|
||||||
final_response="whatever",
|
final_response="whatever",
|
||||||
|
|
|
||||||
|
|
@ -40,14 +40,14 @@ class TestParseJudgeResponse:
|
||||||
def test_clean_json_done(self):
|
def test_clean_json_done(self):
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
done, reason = _parse_judge_response('{"done": true, "reason": "all good"}')
|
done, reason, _ = _parse_judge_response('{"done": true, "reason": "all good"}')
|
||||||
assert done is True
|
assert done is True
|
||||||
assert reason == "all good"
|
assert reason == "all good"
|
||||||
|
|
||||||
def test_clean_json_continue(self):
|
def test_clean_json_continue(self):
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
done, reason = _parse_judge_response('{"done": false, "reason": "more work needed"}')
|
done, reason, _ = _parse_judge_response('{"done": false, "reason": "more work needed"}')
|
||||||
assert done is False
|
assert done is False
|
||||||
assert reason == "more work needed"
|
assert reason == "more work needed"
|
||||||
|
|
||||||
|
|
@ -55,7 +55,7 @@ class TestParseJudgeResponse:
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
raw = '```json\n{"done": true, "reason": "done"}\n```'
|
raw = '```json\n{"done": true, "reason": "done"}\n```'
|
||||||
done, reason = _parse_judge_response(raw)
|
done, reason, _ = _parse_judge_response(raw)
|
||||||
assert done is True
|
assert done is True
|
||||||
assert "done" in reason
|
assert "done" in reason
|
||||||
|
|
||||||
|
|
@ -64,7 +64,7 @@ class TestParseJudgeResponse:
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
|
raw = 'Looking at this... the agent says X. Verdict: {"done": false, "reason": "partial"}'
|
||||||
done, reason = _parse_judge_response(raw)
|
done, reason, _ = _parse_judge_response(raw)
|
||||||
assert done is False
|
assert done is False
|
||||||
assert reason == "partial"
|
assert reason == "partial"
|
||||||
|
|
||||||
|
|
@ -72,24 +72,24 @@ class TestParseJudgeResponse:
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
for s in ("true", "yes", "done", "1"):
|
for s in ("true", "yes", "done", "1"):
|
||||||
done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
||||||
assert done is True
|
assert done is True
|
||||||
for s in ("false", "no", "not yet"):
|
for s in ("false", "no", "not yet"):
|
||||||
done, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
done, _, _ = _parse_judge_response(f'{{"done": "{s}", "reason": "r"}}')
|
||||||
assert done is False
|
assert done is False
|
||||||
|
|
||||||
def test_malformed_json_fails_open(self):
|
def test_malformed_json_fails_open(self):
|
||||||
"""Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
|
"""Non-JSON → not done, with error-ish reason (so judge_goal can map to continue)."""
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
done, reason = _parse_judge_response("this is not json at all")
|
done, reason, _ = _parse_judge_response("this is not json at all")
|
||||||
assert done is False
|
assert done is False
|
||||||
assert reason # non-empty
|
assert reason # non-empty
|
||||||
|
|
||||||
def test_empty_response(self):
|
def test_empty_response(self):
|
||||||
from hermes_cli.goals import _parse_judge_response
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
done, reason = _parse_judge_response("")
|
done, reason, _ = _parse_judge_response("")
|
||||||
assert done is False
|
assert done is False
|
||||||
assert reason
|
assert reason
|
||||||
|
|
||||||
|
|
@ -103,13 +103,13 @@ class TestJudgeGoal:
|
||||||
def test_empty_goal_skipped(self):
|
def test_empty_goal_skipped(self):
|
||||||
from hermes_cli.goals import judge_goal
|
from hermes_cli.goals import judge_goal
|
||||||
|
|
||||||
verdict, _ = judge_goal("", "some response")
|
verdict, _, _ = judge_goal("", "some response")
|
||||||
assert verdict == "skipped"
|
assert verdict == "skipped"
|
||||||
|
|
||||||
def test_empty_response_continues(self):
|
def test_empty_response_continues(self):
|
||||||
from hermes_cli.goals import judge_goal
|
from hermes_cli.goals import judge_goal
|
||||||
|
|
||||||
verdict, _ = judge_goal("ship the thing", "")
|
verdict, _, _ = judge_goal("ship the thing", "")
|
||||||
assert verdict == "continue"
|
assert verdict == "continue"
|
||||||
|
|
||||||
def test_no_aux_client_continues(self):
|
def test_no_aux_client_continues(self):
|
||||||
|
|
@ -120,7 +120,7 @@ class TestJudgeGoal:
|
||||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
return_value=(None, None),
|
return_value=(None, None),
|
||||||
):
|
):
|
||||||
verdict, _ = goals.judge_goal("my goal", "my response")
|
verdict, _, _ = goals.judge_goal("my goal", "my response")
|
||||||
assert verdict == "continue"
|
assert verdict == "continue"
|
||||||
|
|
||||||
def test_api_error_continues(self):
|
def test_api_error_continues(self):
|
||||||
|
|
@ -133,7 +133,7 @@ class TestJudgeGoal:
|
||||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
return_value=(fake_client, "judge-model"),
|
return_value=(fake_client, "judge-model"),
|
||||||
):
|
):
|
||||||
verdict, reason = goals.judge_goal("goal", "response")
|
verdict, reason, _ = goals.judge_goal("goal", "response")
|
||||||
assert verdict == "continue"
|
assert verdict == "continue"
|
||||||
assert "judge error" in reason.lower()
|
assert "judge error" in reason.lower()
|
||||||
|
|
||||||
|
|
@ -152,7 +152,7 @@ class TestJudgeGoal:
|
||||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
return_value=(fake_client, "judge-model"),
|
return_value=(fake_client, "judge-model"),
|
||||||
):
|
):
|
||||||
verdict, reason = goals.judge_goal("goal", "agent response")
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
||||||
assert verdict == "done"
|
assert verdict == "done"
|
||||||
assert reason == "achieved"
|
assert reason == "achieved"
|
||||||
|
|
||||||
|
|
@ -171,7 +171,7 @@ class TestJudgeGoal:
|
||||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
return_value=(fake_client, "judge-model"),
|
return_value=(fake_client, "judge-model"),
|
||||||
):
|
):
|
||||||
verdict, reason = goals.judge_goal("goal", "agent response")
|
verdict, reason, _ = goals.judge_goal("goal", "agent response")
|
||||||
assert verdict == "continue"
|
assert verdict == "continue"
|
||||||
assert reason == "not yet"
|
assert reason == "not yet"
|
||||||
|
|
||||||
|
|
@ -260,7 +260,7 @@ class TestGoalManager:
|
||||||
mgr = GoalManager(session_id="eval-sid-1")
|
mgr = GoalManager(session_id="eval-sid-1")
|
||||||
mgr.set("ship it")
|
mgr.set("ship it")
|
||||||
|
|
||||||
with patch.object(goals, "judge_goal", return_value=("done", "shipped")):
|
with patch.object(goals, "judge_goal", return_value=("done", "shipped", False)):
|
||||||
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
decision = mgr.evaluate_after_turn("I shipped the feature.")
|
||||||
|
|
||||||
assert decision["verdict"] == "done"
|
assert decision["verdict"] == "done"
|
||||||
|
|
@ -276,7 +276,7 @@ class TestGoalManager:
|
||||||
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
mgr = GoalManager(session_id="eval-sid-2", default_max_turns=5)
|
||||||
mgr.set("a long goal")
|
mgr.set("a long goal")
|
||||||
|
|
||||||
with patch.object(goals, "judge_goal", return_value=("continue", "more work")):
|
with patch.object(goals, "judge_goal", return_value=("continue", "more work", False)):
|
||||||
decision = mgr.evaluate_after_turn("made some progress")
|
decision = mgr.evaluate_after_turn("made some progress")
|
||||||
|
|
||||||
assert decision["verdict"] == "continue"
|
assert decision["verdict"] == "continue"
|
||||||
|
|
@ -294,7 +294,7 @@ class TestGoalManager:
|
||||||
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
mgr = GoalManager(session_id="eval-sid-3", default_max_turns=2)
|
||||||
mgr.set("hard goal")
|
mgr.set("hard goal")
|
||||||
|
|
||||||
with patch.object(goals, "judge_goal", return_value=("continue", "not yet")):
|
with patch.object(goals, "judge_goal", return_value=("continue", "not yet", False)):
|
||||||
d1 = mgr.evaluate_after_turn("step 1")
|
d1 = mgr.evaluate_after_turn("step 1")
|
||||||
assert d1["should_continue"] is True
|
assert d1["should_continue"] is True
|
||||||
assert mgr.state.turns_used == 1
|
assert mgr.state.turns_used == 1
|
||||||
|
|
@ -356,3 +356,161 @@ def test_goal_command_dispatches_in_cli_registry_helpers():
|
||||||
assert "/goal" in COMMANDS
|
assert "/goal" in COMMANDS
|
||||||
session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
|
session_cmds = COMMANDS_BY_CATEGORY.get("Session", {})
|
||||||
assert "/goal" in session_cmds
|
assert "/goal" in session_cmds
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Auto-pause on consecutive judge parse failures
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestJudgeParseFailureAutoPause:
|
||||||
|
"""Regression: weak judge models (e.g. deepseek-v4-flash) that return
|
||||||
|
empty strings or non-JSON prose must auto-pause the loop after N turns
|
||||||
|
instead of burning the whole turn budget."""
|
||||||
|
|
||||||
|
def test_parse_response_flags_empty_as_parse_failure(self):
|
||||||
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
|
done, reason, parse_failed = _parse_judge_response("")
|
||||||
|
assert done is False
|
||||||
|
assert parse_failed is True
|
||||||
|
assert "empty" in reason.lower()
|
||||||
|
|
||||||
|
def test_parse_response_flags_non_json_as_parse_failure(self):
|
||||||
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
|
done, reason, parse_failed = _parse_judge_response(
|
||||||
|
"Let me analyze whether the goal is fully satisfied based on the agent's response..."
|
||||||
|
)
|
||||||
|
assert done is False
|
||||||
|
assert parse_failed is True
|
||||||
|
assert "not json" in reason.lower()
|
||||||
|
|
||||||
|
def test_parse_response_clean_json_is_not_parse_failure(self):
|
||||||
|
from hermes_cli.goals import _parse_judge_response
|
||||||
|
|
||||||
|
done, _, parse_failed = _parse_judge_response(
|
||||||
|
'{"done": false, "reason": "more work"}'
|
||||||
|
)
|
||||||
|
assert done is False
|
||||||
|
assert parse_failed is False
|
||||||
|
|
||||||
|
def test_api_error_does_not_count_as_parse_failure(self):
|
||||||
|
"""Transient network/API errors must not trip the auto-pause guard."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
|
||||||
|
fake_client = MagicMock()
|
||||||
|
fake_client.chat.completions.create.side_effect = RuntimeError("connection reset")
|
||||||
|
with patch(
|
||||||
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
|
return_value=(fake_client, "judge-model"),
|
||||||
|
):
|
||||||
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
||||||
|
assert verdict == "continue"
|
||||||
|
assert parse_failed is False
|
||||||
|
|
||||||
|
def test_empty_judge_reply_flagged_as_parse_failure(self):
|
||||||
|
"""End-to-end: judge returns empty content → parse_failed=True."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
|
||||||
|
fake_client = MagicMock()
|
||||||
|
fake_client.chat.completions.create.return_value = MagicMock(
|
||||||
|
choices=[MagicMock(message=MagicMock(content=""))]
|
||||||
|
)
|
||||||
|
with patch(
|
||||||
|
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||||
|
return_value=(fake_client, "judge-model"),
|
||||||
|
):
|
||||||
|
verdict, _, parse_failed = goals.judge_goal("goal", "response")
|
||||||
|
assert verdict == "continue"
|
||||||
|
assert parse_failed is True
|
||||||
|
|
||||||
|
def test_auto_pause_after_three_consecutive_parse_failures(self, hermes_home):
|
||||||
|
"""N=3 consecutive parse failures → auto-pause with config pointer."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
from hermes_cli.goals import GoalManager, DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES
|
||||||
|
|
||||||
|
assert DEFAULT_MAX_CONSECUTIVE_PARSE_FAILURES == 3
|
||||||
|
mgr = GoalManager(session_id="parse-fail-sid-1", default_max_turns=20)
|
||||||
|
mgr.set("do a thing")
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
goals, "judge_goal", return_value=("continue", "judge returned empty response", True)
|
||||||
|
):
|
||||||
|
d1 = mgr.evaluate_after_turn("step 1")
|
||||||
|
assert d1["should_continue"] is True
|
||||||
|
assert mgr.state.consecutive_parse_failures == 1
|
||||||
|
|
||||||
|
d2 = mgr.evaluate_after_turn("step 2")
|
||||||
|
assert d2["should_continue"] is True
|
||||||
|
assert mgr.state.consecutive_parse_failures == 2
|
||||||
|
|
||||||
|
d3 = mgr.evaluate_after_turn("step 3")
|
||||||
|
assert d3["should_continue"] is False
|
||||||
|
assert d3["status"] == "paused"
|
||||||
|
assert mgr.state.consecutive_parse_failures == 3
|
||||||
|
# Message points at the config surface so the user can fix it.
|
||||||
|
assert "auxiliary" in d3["message"]
|
||||||
|
assert "goal_judge" in d3["message"]
|
||||||
|
assert "config.yaml" in d3["message"]
|
||||||
|
|
||||||
|
def test_parse_failure_counter_resets_on_good_reply(self, hermes_home):
|
||||||
|
"""A single good judge reply resets the counter — transient flakes don't pause."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
from hermes_cli.goals import GoalManager
|
||||||
|
|
||||||
|
mgr = GoalManager(session_id="parse-fail-sid-2", default_max_turns=20)
|
||||||
|
mgr.set("another goal")
|
||||||
|
|
||||||
|
# Two parse failures…
|
||||||
|
with patch.object(
|
||||||
|
goals, "judge_goal", return_value=("continue", "not json", True)
|
||||||
|
):
|
||||||
|
mgr.evaluate_after_turn("step 1")
|
||||||
|
mgr.evaluate_after_turn("step 2")
|
||||||
|
assert mgr.state.consecutive_parse_failures == 2
|
||||||
|
|
||||||
|
# …then one clean reply resets the counter.
|
||||||
|
with patch.object(
|
||||||
|
goals, "judge_goal", return_value=("continue", "making progress", False)
|
||||||
|
):
|
||||||
|
d = mgr.evaluate_after_turn("step 3")
|
||||||
|
assert d["should_continue"] is True
|
||||||
|
assert mgr.state.consecutive_parse_failures == 0
|
||||||
|
|
||||||
|
def test_parse_failure_counter_not_incremented_by_api_errors(self, hermes_home):
|
||||||
|
"""API/transport errors must NOT count toward the auto-pause threshold."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
from hermes_cli.goals import GoalManager
|
||||||
|
|
||||||
|
mgr = GoalManager(session_id="parse-fail-sid-3", default_max_turns=20)
|
||||||
|
mgr.set("goal")
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
goals, "judge_goal", return_value=("continue", "judge error: RuntimeError", False)
|
||||||
|
):
|
||||||
|
for _ in range(5):
|
||||||
|
d = mgr.evaluate_after_turn("still going")
|
||||||
|
assert d["should_continue"] is True
|
||||||
|
assert mgr.state.consecutive_parse_failures == 0
|
||||||
|
assert mgr.state.status == "active"
|
||||||
|
|
||||||
|
def test_consecutive_parse_failures_persists_across_goalmanager_reloads(
|
||||||
|
self, hermes_home
|
||||||
|
):
|
||||||
|
"""The counter must be durable so cross-session resumes see it."""
|
||||||
|
from hermes_cli import goals
|
||||||
|
from hermes_cli.goals import GoalManager, load_goal
|
||||||
|
|
||||||
|
mgr = GoalManager(session_id="parse-fail-sid-4", default_max_turns=20)
|
||||||
|
mgr.set("persistent goal")
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
goals, "judge_goal", return_value=("continue", "empty", True)
|
||||||
|
):
|
||||||
|
mgr.evaluate_after_turn("r")
|
||||||
|
mgr.evaluate_after_turn("r")
|
||||||
|
|
||||||
|
reloaded = load_goal("parse-fail-sid-4")
|
||||||
|
assert reloaded is not None
|
||||||
|
assert reloaded.consecutive_parse_failures == 2
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue