mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(goals): raise judge max_tokens 200 → 4096, make configurable
The freeform /goal judge was capped at max_tokens=200, which reliably
truncated the JSON verdict on reasoning-heavy models (deepseek-v4-pro,
qwq, etc.) — the model burns tokens on hidden reasoning before emitting
visible content, and the first /goal turn's prompt is larger than later
turns, blowing past 200. Symptom: agent.log shows
`judge reply was not JSON: '{"done": true, "reason": "The agent successfully'`
followed by repeated `judge returned empty response` lines, then the
goal pauses with a misleading 'judge model isn't returning the required
JSON verdict' message.
Diagnosed live by @helix4u — empirically verified that raising the
budget on an unmodified worktree makes the failures go away on the
exact configs users were hitting on Nous Plus subscription paths.
Changes:
- DEFAULT_JUDGE_MAX_TOKENS = 4096 (up from 200)
- New auxiliary.goal_judge.max_tokens config knob for tuning in
specifically constrained setups
- _goal_judge_max_tokens() resolves the value with fail-open semantics
(non-int / non-positive / load failure → default). load_config() is
mtime-cached so per-turn lookup is cheap.
Scoped narrowly to the verified root cause — does not introduce a
submit_verdict tool-call schema (see #26162 / #23671 for that direction;
they can land separately if we want them).
Tests: tests/hermes_cli/test_goals.py + tests/cli/test_cli_goal_interrupt.py
+ tests/gateway/test_goal_verdict_send.py — 62/62 passing.
E2E verified: config override honored (8192), missing/garbage/zero
values fall back to 4096, no-auxiliary-section falls back to 4096.
Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>
Credits:
- @helix4u (Gille) — diagnosed the max_tokens=200 truncation via live
testing on an unmodified worktree, drafted the original fix shape
in #26162.
- @AhmetArif0 — flagged the freeform judge fragility in #23671 from
the tool-call angle.
- @0xharryriddle (HarryRiddle.eth) — reported the issue from a Nous
Plus subscription setup in #23876 with full debug reports.
Closes #23876
Supersedes #26162, #23671, #23881
This commit is contained in:
parent
965ae7fa97
commit
f9ad7400e3
1 changed file with 35 additions and 1 deletion
|
|
@@ -45,6 +45,16 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
DEFAULT_MAX_TURNS = 20
|
||||
DEFAULT_JUDGE_TIMEOUT = 30.0
|
||||
# Judge output budget. The freeform judge returns a one-line JSON verdict, but
|
||||
# reasoning models (deepseek-v4, qwq, etc.) burn tokens on hidden reasoning
|
||||
# before emitting the visible JSON — and the first /goal turn's prompt is
|
||||
# larger than later turns, which pushes total reply length past tight caps.
|
||||
# 200 tokens (the original default) reliably truncated the JSON on reasoning
|
||||
# models, leaving '{"done": true, "reason": "The agent successfully' and
|
||||
# triggering the auto-pause. 4096 covers reasoning + verdict on every model
|
||||
# we've live-tested; override via auxiliary.goal_judge.max_tokens for
|
||||
# specifically constrained setups.
|
||||
DEFAULT_JUDGE_MAX_TOKENS = 4096
|
||||
# Cap how much of the last response + recent messages we send to the judge.
|
||||
_JUDGE_RESPONSE_SNIPPET_CHARS = 4000
|
||||
# After this many consecutive judge *parse* failures (empty output / non-JSON),
|
||||
|
|
@@ -282,6 +292,30 @@ def _truncate(text: str, limit: int) -> str:
|
|||
_JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
|
||||
|
||||
|
||||
def _goal_judge_max_tokens() -> int:
    """Resolve ``auxiliary.goal_judge.max_tokens``, falling back to the default.

    The value is read from ``load_config()`` on every call; ``load_config()``
    is cached on the config file's (mtime, size), so calling this once per
    judge turn is cheap.

    Returns:
        The configured budget when it converts to a positive ``int``;
        otherwise ``DEFAULT_JUDGE_MAX_TOKENS``. The fallback covers a missing
        or ``null`` ``auxiliary`` / ``goal_judge`` section, a missing key, a
        non-numeric, boolean, zero or negative value, and any failure while
        importing or loading the config.

    This function never raises: a bad config must not crash the goal loop.
    """
    try:
        from hermes_cli.config import load_config

        cfg = load_config()
        # ``or {}`` at both levels so an explicit ``null`` section takes the
        # normal fallback path instead of raising AttributeError.
        judge_cfg = (cfg.get("auxiliary") or {}).get("goal_judge") or {}
        value = judge_cfg.get("max_tokens", DEFAULT_JUDGE_MAX_TOKENS)
        # bool is a subclass of int: without this guard ``max_tokens: true``
        # would pass int() and resolve to a 1-token budget, which truncates
        # every verdict.
        if isinstance(value, bool):
            return DEFAULT_JUDGE_MAX_TOKENS
        value = int(value)
        if value > 0:
            return value
    except Exception:
        # Deliberately fail-open, but leave a trace so a broken config can be
        # diagnosed from the debug log instead of vanishing silently.
        logger.debug(
            "goal judge max_tokens lookup failed; using default %d",
            DEFAULT_JUDGE_MAX_TOKENS,
            exc_info=True,
        )
    return DEFAULT_JUDGE_MAX_TOKENS
|
||||
|
||||
|
||||
def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
|
||||
"""Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
|
||||
|
||||
|
|
@@ -404,7 +438,7 @@ def judge_goal(
|
|||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=200,
|
||||
max_tokens=_goal_judge_max_tokens(),
|
||||
timeout=timeout,
|
||||
extra_body=get_auxiliary_extra_body() or None,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue