From f9ad7400e30517159712a77e6a4bc2f3a390b2db Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Thu, 14 May 2026 23:43:13 -0700
Subject: [PATCH] =?UTF-8?q?fix(goals):=20raise=20judge=20max=5Ftokens=2020?=
 =?UTF-8?q?0=20=E2=86=92=204096,=20make=20configurable?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The freeform /goal judge was capped at max_tokens=200, which reliably
truncated the JSON verdict on reasoning-heavy models (deepseek-v4-pro,
qwq, etc.) — the model burns tokens on hidden reasoning before emitting
visible content, and the first /goal turn's prompt is larger than later
turns, blowing past 200. Symptom: agent.log shows
`judge reply was not JSON: '{"done": true, "reason": "The agent successfully'`
followed by repeated `judge returned empty response` lines, then the
goal pauses with a misleading 'judge model isn't returning the required
JSON verdict' message.

Diagnosed live by @helix4u — empirically verified that raising the
budget on an unmodified worktree makes the failures go away on the
exact configs users were hitting on Nous Plus subscription paths.

Changes:
- DEFAULT_JUDGE_MAX_TOKENS = 4096 (up from 200)
- New auxiliary.goal_judge.max_tokens config knob for tuning in
  specifically constrained setups
- _goal_judge_max_tokens() resolves the value with fail-open semantics
  (non-int / non-positive / load failure → default). load_config() is
  mtime-cached so per-turn lookup is cheap.

Scoped narrowly to the verified root cause — does not introduce a
submit_verdict tool-call schema (see #26162 / #23671 for that direction;
they can land separately if we want them).

Tests: tests/hermes_cli/test_goals.py + tests/cli/test_cli_goal_interrupt.py
+ tests/gateway/test_goal_verdict_send.py — 62/62 passing.

E2E verified: config override honored (8192), missing/garbage/zero
values fall back to 4096, no-auxiliary-section falls back to 4096.

Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>

Credits:
- @helix4u (Gille) — diagnosed the max_tokens=200 truncation via live
  testing on an unmodified worktree, drafted the original fix shape
  in #26162.
- @AhmetArif0 — flagged the freeform judge fragility in #23671 from
  the tool-call angle.
- @0xharryriddle (HarryRiddle.eth) — reported the issue from a Nous
  Plus subscription setup in #23876 with full debug reports.

Closes #23876
Supersedes #26162, #23671, #23881
---
 hermes_cli/goals.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py
index 1542b9a7a38..62ee00547c1 100644
--- a/hermes_cli/goals.py
+++ b/hermes_cli/goals.py
@@ -45,6 +45,16 @@ logger = logging.getLogger(__name__)
 
 DEFAULT_MAX_TURNS = 20
 DEFAULT_JUDGE_TIMEOUT = 30.0
+# Judge output budget. The freeform judge returns a one-line JSON verdict, but
+# reasoning models (deepseek-v4, qwq, etc.) burn tokens on hidden reasoning
+# before emitting the visible JSON — and the first /goal turn's prompt is
+# larger than later turns, which pushes total reply length past tight caps.
+# 200 tokens (the original default) reliably truncated the JSON on reasoning
+# models, leaving '{"done": true, "reason": "The agent successfully' and
+# triggering the auto-pause. 4096 covers reasoning + verdict on every model
+# we've live-tested; override via auxiliary.goal_judge.max_tokens for
+# specifically constrained setups.
+DEFAULT_JUDGE_MAX_TOKENS = 4096
 # Cap how much of the last response + recent messages we send to the judge.
 _JUDGE_RESPONSE_SNIPPET_CHARS = 4000
 # After this many consecutive judge *parse* failures (empty output / non-JSON),
@@ -282,6 +292,30 @@ def _truncate(text: str, limit: int) -> str:
 _JSON_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
 
 
+def _goal_judge_max_tokens() -> int:
+    """Resolve auxiliary.goal_judge.max_tokens, falling back to the default.
+
+    ``load_config()`` is cached on the config file's (mtime, size), so calling
+    this once per judge turn is cheap. A non-positive or non-int value falls
+    back to the default rather than crashing the goal loop.
+    """
+    try:
+        from hermes_cli.config import load_config
+
+        cfg = load_config()
+        value = (
+            (cfg.get("auxiliary") or {})
+            .get("goal_judge", {})
+            .get("max_tokens", DEFAULT_JUDGE_MAX_TOKENS)
+        )
+        value = int(value)
+        if value > 0:
+            return value
+    except Exception:
+        pass
+    return DEFAULT_JUDGE_MAX_TOKENS
+
+
 def _parse_judge_response(raw: str) -> Tuple[bool, str, bool]:
     """Parse the judge's reply. Fail-open to ``(False, "<reason>", parse_failed)``.
 
@@ -404,7 +438,7 @@ def judge_goal(
                 {"role": "user", "content": prompt},
             ],
             temperature=0,
-            max_tokens=200,
+            max_tokens=_goal_judge_max_tokens(),
             timeout=timeout,
             extra_body=get_auxiliary_extra_body() or None,
         )