diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index 1542b9a7a38..62ee00547c1 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -45,6 +45,16 @@ logger = logging.getLogger(__name__) DEFAULT_MAX_TURNS = 20 DEFAULT_JUDGE_TIMEOUT = 30.0 +# Judge output budget. The freeform judge returns a one-line JSON verdict, but +# reasoning models (deepseek-v4, qwq, etc.) burn tokens on hidden reasoning +# before emitting the visible JSON — and the first /goal turn's prompt is +# larger than later turns, which pushes total reply length past tight caps. +# 200 tokens (the original default) reliably truncated the JSON on reasoning +# models, leaving '{"done": true, "reason": "The agent successfully' and +# triggering the auto-pause. 4096 covers reasoning + verdict on every model +# we've live-tested; override via auxiliary.goal_judge.max_tokens for +# specifically constrained setups. +DEFAULT_JUDGE_MAX_TOKENS = 4096 # Cap how much of the last response + recent messages we send to the judge. _JUDGE_RESPONSE_SNIPPET_CHARS = 4000 # After this many consecutive judge *parse* failures (empty output / non-JSON), @@ -282,6 +292,30 @@ def _truncate(text: str, limit: int) -> str: _JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL) +def _goal_judge_max_tokens() -> int: + """Resolve auxiliary.goal_judge.max_tokens, falling back to the default. + + ``load_config()`` is cached on the config file's (mtime, size), so calling + this once per judge turn is cheap. A non-positive or non-int value falls + back to the default rather than crashing the goal loop. + """ + try: + from hermes_cli.config import load_config + + cfg = load_config() + value = ( + (cfg.get("auxiliary") or {}) + .get("goal_judge", {}) + .get("max_tokens", DEFAULT_JUDGE_MAX_TOKENS) + ) + value = int(value) + if value > 0: + return value + except Exception: + pass + return DEFAULT_JUDGE_MAX_TOKENS + + def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]: """Parse the judge's reply. Fail-open to ``(False, "", parse_failed)``. @@ -404,7 +438,7 @@ def judge_goal( {"role": "user", "content": prompt}, ], temperature=0, - max_tokens=200, + max_tokens=_goal_judge_max_tokens(), timeout=timeout, extra_body=get_auxiliary_extra_body() or None, )