From 04de307d62277998ee8e52dfa4da59b539917721 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:04:40 -0700 Subject: [PATCH 01/89] fix(cli): repaint input area after inline /steer and /model submit (#34839) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle_enter dispatches /steer and /model inline on the UI thread while the agent is running, calling buffer.reset() then returning. Unlike every other early-return branch in the handler, these two skipped event.app.invalidate(). process_command() prints through patch_stdout (scrolls output above the prompt without redrawing the input line), so the just-cleared input area could keep showing the submitted '/steer ' until an unrelated redraw fired — looking unsent and inviting an accidental re-submit. Add event.app.invalidate() after reset in both inline branches to match the sibling branches. AST regression test pins the invariant: every reset-then-return branch in handle_enter must invalidate first. Fixes #34569 --- cli.py | 14 +++ tests/cli/test_steer_inline_repaint_34569.py | 116 +++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 tests/cli/test_steer_inline_repaint_34569.py diff --git a/cli.py b/cli.py index 23035297f4f..770483df51b 100644 --- a/cli.py +++ b/cli.py @@ -12945,6 +12945,13 @@ class HermesCLI: if event.app.is_running: event.app.exit() event.app.current_buffer.reset(append_to_history=True) + # Force a repaint: process_command() prints through + # patch_stdout (scrolls output above the prompt) and never + # invalidates the app, so the just-cleared input area can + # keep showing the submitted text until some unrelated + # redraw fires. Every other early-return branch in this + # handler invalidates after reset — match them. + event.app.invalidate() return # Handle /steer while the agent is running immediately on the @@ -12956,6 +12963,13 @@ class HermesCLI: if self._should_handle_steer_command_inline(text, has_images=has_images): self.process_command(text) event.app.current_buffer.reset(append_to_history=True) + # Force a repaint after clearing the buffer. /steer is + # dispatched mid-run while the agent streams output through + # patch_stdout; process_command() never invalidates the + # app, so without this the submitted "/steer " can + # linger in the input area (looking unsent) and invite an + # accidental re-submit. See issue #34569. + event.app.invalidate() return # Snapshot and clear attached images diff --git a/tests/cli/test_steer_inline_repaint_34569.py b/tests/cli/test_steer_inline_repaint_34569.py new file mode 100644 index 00000000000..8c0bce3d125 --- /dev/null +++ b/tests/cli/test_steer_inline_repaint_34569.py @@ -0,0 +1,116 @@ +"""Regression guard for issue #34569 — inline /steer (and /model) submit +must repaint the input area after clearing the buffer. + +Mechanism of the bug +-------------------- +``handle_enter`` dispatches ``/steer`` (and ``/model``) inline on the UI +thread while the agent is running. Those branches called +``buffer.reset(append_to_history=True)`` but — unlike every *other* +early-return branch in the handler — did NOT call ``event.app.invalidate()``. +Because ``process_command()`` prints through ``patch_stdout`` (which scrolls +output above the prompt and never triggers a prompt_toolkit redraw), the +just-cleared input area could keep showing the submitted ``/steer `` +until some unrelated redraw fired. The user saw their submitted text as if +it were unsent and could accidentally re-submit it. + +This test pins the contract structurally: inside ``handle_enter``, any +inline-command early-return that resets the buffer must be followed by an +``event.app.invalidate()`` before its ``return``. It is an *invariant* +(every reset-then-return repaints), not a snapshot of current source. +""" + +from __future__ import annotations + +import ast +from pathlib import Path + + +def _load_handle_enter_node() -> ast.FunctionDef: + """Extract the ``handle_enter`` nested function node from cli.py.""" + cli_path = Path(__file__).resolve().parents[2] / "cli.py" + tree = ast.parse(cli_path.read_text(encoding="utf-8")) + + target = None + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name == "handle_enter": + target = node + break + assert target is not None, "handle_enter closure not found in cli.py" + return target + + +def _is_buffer_reset(node: ast.stmt) -> bool: + """True if the statement is ``...current_buffer.reset(...)``.""" + if not isinstance(node, ast.Expr): + return False + call = node.value + if not isinstance(call, ast.Call): + return False + func = call.func + return isinstance(func, ast.Attribute) and func.attr == "reset" + + +def _is_invalidate(node: ast.stmt) -> bool: + """True if the statement is ``event.app.invalidate()``.""" + if not isinstance(node, ast.Expr): + return False + call = node.value + if not isinstance(call, ast.Call): + return False + func = call.func + return isinstance(func, ast.Attribute) and func.attr == "invalidate" + + +def _collect_reset_blocks(func: ast.FunctionDef) -> list[list[ast.stmt]]: + """Find every statement sequence (a block body/orelse/finalbody) within + ``handle_enter`` that contains a ``buffer.reset()`` call.""" + blocks: list[list[ast.stmt]] = [] + for node in ast.walk(func): + for attr in ("body", "orelse", "finalbody"): + seq = getattr(node, attr, None) + if not isinstance(seq, list): + continue + if any(isinstance(s, ast.stmt) and _is_buffer_reset(s) for s in seq): + blocks.append(seq) + return blocks + + +def test_inline_command_reset_branches_invalidate(): + """Every handle_enter branch that resets the buffer and then returns must + invalidate the app first (issue #34569).""" + func = _load_handle_enter_node() + reset_blocks = _collect_reset_blocks(func) + + assert reset_blocks, "expected to find buffer.reset() calls in handle_enter" + + offenders = [] + for seq in reset_blocks: + for i, stmt in enumerate(seq): + if not _is_buffer_reset(stmt): + continue + # Find the next return after this reset in the same block. + ret_idx = None + for j in range(i + 1, len(seq)): + if isinstance(seq[j], ast.Return): + ret_idx = j + break + if ret_idx is None: + # reset not directly followed by a return in this block + # (e.g. the fall-through reset at the end of the handler) — + # the next user input naturally repaints, so skip. + continue + between = seq[i + 1 : ret_idx] + if not any(_is_invalidate(s) for s in between): + offenders.append(ast.dump(stmt)) + + assert not offenders, ( + "handle_enter has reset-then-return branch(es) that never call " + "event.app.invalidate() — the input area can keep showing the " + "submitted text (issue #34569). Offending reset stmts:\n" + + "\n".join(offenders) + ) + + +if __name__ == "__main__": # pragma: no cover + test_inline_command_reset_branches_invalidate() + print("ok") From e38b0b55d12cfa39a6ac71d553d224c0711856f2 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Fri, 29 May 2026 14:45:53 -0600 Subject: [PATCH 02/89] fix(compression): avoid repeat preflight compaction from rough estimates --- agent/context_compressor.py | 46 +++++++++++++++ agent/context_engine.py | 9 +++ agent/conversation_compression.py | 15 +++-- agent/conversation_loop.py | 31 ++++++++-- tests/agent/test_context_compressor.py | 29 ++++++++++ tests/run_agent/test_413_compression.py | 77 +++++++++++++++++++++++++ 6 files changed, 193 insertions(+), 14 deletions(-) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 58829dbf4fb..cf9c534decd 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -518,6 +518,10 @@ class ContextCompressor(ContextEngine): self._last_compression_savings_pct = 100.0 self._ineffective_compression_count = 0 self._summary_failure_cooldown_until = 0.0 # transient errors must not block a fresh session + self.last_real_prompt_tokens = 0 + self.last_compression_rough_tokens = 0 + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False def update_model( self, @@ -615,6 +619,10 @@ class ContextCompressor(ContextEngine): self.last_prompt_tokens = 0 self.last_completion_tokens = 0 + self.last_real_prompt_tokens = 0 + self.last_compression_rough_tokens = 0 + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False self.summary_model = summary_model_override or "" @@ -648,6 +656,44 @@ class ContextCompressor(ContextEngine): self.last_prompt_tokens = usage.get("prompt_tokens", 0) self.last_completion_tokens = usage.get("completion_tokens", 0) self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens) + if self.last_prompt_tokens > 0: + self.last_real_prompt_tokens = self.last_prompt_tokens + if self.last_prompt_tokens < self.threshold_tokens: + if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0: + self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens + else: + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False + + def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool: + """Return True when a high rough preflight estimate is known-noisy. + + ``estimate_request_tokens_rough(..., tools=...)`` intentionally + overestimates schema-heavy requests so Hermes compresses before a + provider rejects the payload. After a successful compressed API call, + though, provider ``prompt_tokens`` are a better signal than repeating + compaction from the same rough schema overhead. Defer only while the + rough estimate has grown modestly since a request the provider proved + fit under the threshold. + """ + if rough_tokens < self.threshold_tokens: + return False + if self.last_real_prompt_tokens <= 0: + return False + if self.last_real_prompt_tokens >= self.threshold_tokens: + return False + + baseline = self.last_rough_tokens_when_real_prompt_fit or self.last_compression_rough_tokens + if baseline <= 0: + return False + + growth = max(0, rough_tokens - baseline) + tolerated_growth = max(4096, int(self.threshold_tokens * 0.05)) + if growth > tolerated_growth: + return False + + self.last_rough_tokens_when_real_prompt_fit = max(baseline, rough_tokens) + return True def should_compress(self, prompt_tokens: int = None) -> bool: """Check if context exceeds the compression threshold. diff --git a/agent/context_engine.py b/agent/context_engine.py index bb426fc189d..79c31fb48e6 100644 --- a/agent/context_engine.py +++ b/agent/context_engine.py @@ -115,6 +115,15 @@ class ContextEngine(ABC): """ return False + def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool: + """Return True when preflight should trust recent real usage instead. + + Built-in compression uses this to avoid re-compacting from known-noisy + rough estimates after a compressed request has already fit. Third-party + engines can ignore it safely. + """ + return False + # -- Optional: manual /compress preflight ------------------------------ def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool: diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 9a93ba4a496..ba8678cc723 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -575,19 +575,18 @@ def compress_context( force=True, ) - # Update token estimate after compaction so pressure calculations - # use the post-compression count, not the stale pre-compression one. - # Use estimate_request_tokens_rough() so tool schemas are included — - # with 50+ tools enabled, schemas alone can add 20-30K tokens, and - # omitting them delays the next compression cycle far past the - # configured threshold (issue #14695). + # Keep the post-compression rough estimate for diagnostics, but do not + # treat it as provider-reported prompt usage. Schema-heavy rough estimates + # can remain above threshold even after the next real API request fits. _compressed_est = estimate_request_tokens_rough( compressed, system_prompt=new_system_prompt or "", tools=agent.tools or None, ) - agent.context_compressor.last_prompt_tokens = _compressed_est + agent.context_compressor.last_compression_rough_tokens = _compressed_est + agent.context_compressor.last_prompt_tokens = -1 agent.context_compressor.last_completion_tokens = 0 + agent.context_compressor.awaiting_real_usage_after_compression = True # Clear the file-read dedup cache. After compression the original # read content is summarised away — if the model re-reads the same @@ -599,7 +598,7 @@ def compress_context( pass logger.info( - "context compression done: session=%s messages=%d->%d tokens=~%s", + "context compression done: session=%s messages=%d->%d rough_tokens=~%s awaiting_real_usage=true", agent.session_id or "none", _pre_msg_count, len(compressed), f"{_compressed_est:,}", ) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index a6c975be391..f72014b9c0a 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -600,18 +600,32 @@ def run_conversation( system_prompt=active_system_prompt or "", tools=agent.tools or None, ) + _compressor = agent.context_compressor + _defer_preflight = getattr( + _compressor, + "should_defer_preflight_to_real_usage", + lambda _tokens: False, + ) - if agent.context_compressor.should_compress(_preflight_tokens): + if _defer_preflight(_preflight_tokens): + logger.info( + "Skipping preflight compression: rough estimate ~%s >= %s, " + "but last real provider prompt was %s after compression", + f"{_preflight_tokens:,}", + f"{_compressor.threshold_tokens:,}", + f"{_compressor.last_real_prompt_tokens:,}", + ) + elif _compressor.should_compress(_preflight_tokens): logger.info( "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", f"{_preflight_tokens:,}", - f"{agent.context_compressor.threshold_tokens:,}", + f"{_compressor.threshold_tokens:,}", agent.model, - f"{agent.context_compressor.context_length:,}", + f"{_compressor.context_length:,}", ) agent._emit_status( f"📦 Preflight compression: ~{_preflight_tokens:,} tokens " - f">= {agent.context_compressor.threshold_tokens:,} threshold. " + f">= {_compressor.threshold_tokens:,} threshold. " "This may take a moment." ) # May need multiple passes for very large sessions with small @@ -646,8 +660,8 @@ def run_conversation( system_prompt=active_system_prompt or "", tools=agent.tools or None, ) - if _preflight_tokens < agent.context_compressor.threshold_tokens: - break # Under threshold + if not _compressor.should_compress(_preflight_tokens): + break # Under threshold or anti-thrash guard stopped it # Plugin hook: pre_llm_call # Fired once per turn before the tool-calling loop. Plugins can @@ -3862,6 +3876,11 @@ def run_conversation( # inflate completion_tokens with reasoning, # causing premature compression. (#12026) _real_tokens = _compressor.last_prompt_tokens + elif _compressor.last_prompt_tokens == -1: + # Compression just ran and no API-reported prompt count + # has arrived yet. Avoid treating a schema-heavy rough + # post-compression estimate as real context pressure. + _real_tokens = 0 else: # Include tool schemas — with 50+ tools enabled # these add 20-30K tokens the messages-only diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 0d7aa81f41f..5ce753864c9 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -41,6 +41,8 @@ class TestShouldCompress: class TestUpdateFromResponse: def test_updates_fields(self, compressor): + compressor.awaiting_real_usage_after_compression = True + compressor.last_compression_rough_tokens = 90_000 compressor.update_from_response({ "prompt_tokens": 5000, "completion_tokens": 1000, @@ -48,12 +50,39 @@ class TestUpdateFromResponse: }) assert compressor.last_prompt_tokens == 5000 assert compressor.last_completion_tokens == 1000 + assert compressor.last_real_prompt_tokens == 5000 + assert compressor.last_rough_tokens_when_real_prompt_fit == 90_000 + assert compressor.awaiting_real_usage_after_compression is False def test_missing_fields_default_zero(self, compressor): compressor.update_from_response({}) assert compressor.last_prompt_tokens == 0 +class TestPreflightDeferral: + def test_defers_when_recent_real_usage_fit_and_rough_growth_is_small(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 50_000 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(93_000) is True + assert compressor.last_rough_tokens_when_real_prompt_fit == 93_000 + + def test_does_not_defer_when_rough_growth_is_large(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 50_000 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(100_000) is False + + def test_does_not_defer_without_recent_real_usage(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 0 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(93_000) is False + + class TestCompress: def _make_messages(self, n): diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index 6695d6c275e..37cafa7985d 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -491,6 +491,83 @@ class TestPreflightCompression: for ev, msg in status_messages ) + def test_preflight_defers_when_recent_real_usage_fit(self, agent): + """A noisy rough estimate should not re-compact a recently fitting request.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 100_000 + agent.context_compressor.last_prompt_tokens = 58_000 + agent.context_compressor.last_real_prompt_tokens = 58_000 + agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded"}) + + ok_resp = _mock_response( + content="Used real fit", + finish_reason="stop", + usage={"prompt_tokens": 59_000, "completion_tokens": 100, "total_tokens": 59_100}, + ) + agent.client.chat.completions.create.side_effect = [ok_resp] + status_messages = [] + agent.status_callback = lambda ev, msg: status_messages.append((ev, msg)) + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=114_000), + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + mock_compress.assert_not_called() + assert result["completed"] is True + assert result["final_response"] == "Used real fit" + assert not any( + ev == "lifecycle" and "Preflight compression" in msg + for ev, msg in status_messages + ) + + def test_preflight_compresses_when_rough_growth_after_fit_is_large(self, agent): + """Large rough growth after a fitting request still triggers preflight.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 100_000 + agent.context_compressor.last_prompt_tokens = 58_000 + agent.context_compressor.last_real_prompt_tokens = 58_000 + agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded"}) + + ok_resp = _mock_response( + content="Compressed after growth", + finish_reason="stop", + usage={"prompt_tokens": 50_000, "completion_tokens": 100, "total_tokens": 50_100}, + ) + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=[125_000, 40_000]), + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": f"{SUMMARY_PREFIX}\nPrevious conversation"}], + "new system prompt", + ) + result = agent.run_conversation("hello", conversation_history=big_history) + + mock_compress.assert_called_once() + assert result["completed"] is True + def test_no_preflight_when_under_threshold(self, agent): """When history fits within context, no preflight compression needed.""" agent.compression_enabled = True From 9dbc3722aeb3fba31adfa181c4b05049d8c997bf Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 17:26:30 -0700 Subject: [PATCH 03/89] test(compression): fix StopIteration in large-rough-growth preflight test The rough-estimate mock supplied only 2 side_effect values but the conversation loop calls estimate_request_tokens_rough a third time for the post-response real-token estimate, exhausting the iterator. Use a callable side_effect that returns 125k once (to fire preflight) then sub-threshold values, independent of call count. --- tests/run_agent/test_413_compression.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index 37cafa7985d..a2838d7cfbd 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -552,8 +552,21 @@ class TestPreflightCompression: ) agent.client.chat.completions.create.side_effect = [ok_resp] + # First rough estimate must clear the threshold so preflight fires + # (rough growth since the last fitting request is large, so the + # deferral path is NOT taken). Every estimate after compaction is + # sub-threshold. Use a callable side_effect rather than a fixed list + # so we don't have to predict how many times the loop re-estimates — + # the post-response real-token estimate is an extra call that a + # 2-element list would exhaust (StopIteration). + _rough_calls = {"n": 0} + + def _rough_estimate(*_args, **_kwargs): + _rough_calls["n"] += 1 + return 125_000 if _rough_calls["n"] == 1 else 40_000 + with ( - patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=[125_000, 40_000]), + patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=_rough_estimate), patch.object(agent, "_compress_context") as mock_compress, patch.object(agent, "_persist_session"), patch.object(agent, "_save_trajectory"), From 45bc65abbe4767b327cea3b44300a25e5e7d97aa Mon Sep 17 00:00:00 2001 From: Bartok9 Date: Fri, 29 May 2026 08:51:41 -0400 Subject: [PATCH 04/89] fix(gateway): drop outbound silence-narration messages pre-send MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hallucinated 'silence' tokens (*(silent)*, _silent_, the bare '.', '...', 'silent', no response/reply, the mute emoji) are emitted when a persona has nothing actionable to say. In bot-to-bot channels the receiving bot mirrors the token back, creating a tight loop that burns API tokens and can crash a model with 'no content after all retries'. SOUL.md/prompt rules drift across providers and have already failed in practice, so add a substrate-level guard. _deliver_to_platform now drops a message whose finalized content is only a silence-narration token, logs a WARNING with platform/chat_id/truncated content, and returns {success: True, filtered: 'silence_narration', delivered: False} instead of calling the adapter. Single chokepoint covers every platform adapter; the regex is anchored start/end with a 64-char guard so prose like 'Silence is golden — here is the plan...' or 'Silent install completed' is never dropped. Local/file delivery is a separate path and is left untouched. Opt out via gateway.filter_silence_narration: false or the HERMES_FILTER_SILENCE_NARRATION env override (env wins when set). Closes #34616 --- gateway/config.py | 16 ++ gateway/delivery.py | 61 ++++++ tests/gateway/test_delivery_silence_filter.py | 202 ++++++++++++++++++ 3 files changed, 279 insertions(+) create mode 100644 tests/gateway/test_delivery_silence_filter.py diff --git a/gateway/config.py b/gateway/config.py index 6f30ee70643..d8ed3ebe827 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -474,6 +474,13 @@ class GatewayConfig: # Delivery settings always_log_local: bool = True # Always save cron outputs to local files + # Drop outbound "silence narration" messages (e.g. *(silent)*, 🔇, a bare + # ".") pre-send. These are model hallucinations emitted when a persona has + # nothing actionable to say; in bot-to-bot channels they mirror back and + # forth, burning tokens and crashing models. Substrate-level guard that + # survives SOUL.md/prompt drift across providers. Opt out with False for + # raw passthrough. + filter_silence_narration: bool = True # STT settings stt_enabled: bool = True # Whether to auto-transcribe inbound voice messages @@ -582,6 +589,7 @@ class GatewayConfig: "quick_commands": self.quick_commands, "sessions_dir": str(self.sessions_dir), "always_log_local": self.always_log_local, + "filter_silence_narration": self.filter_silence_narration, "stt_enabled": self.stt_enabled, "group_sessions_per_user": self.group_sessions_per_user, "thread_sessions_per_user": self.thread_sessions_per_user, @@ -650,6 +658,9 @@ class GatewayConfig: quick_commands=quick_commands, sessions_dir=sessions_dir, always_log_local=_coerce_bool(data.get("always_log_local"), True), + filter_silence_narration=_coerce_bool( + data.get("filter_silence_narration"), True + ), stt_enabled=_coerce_bool(stt_enabled, True), group_sessions_per_user=_coerce_bool(group_sessions_per_user, True), thread_sessions_per_user=_coerce_bool(thread_sessions_per_user, False), @@ -757,6 +768,11 @@ def load_gateway_config() -> GatewayConfig: if "always_log_local" in yaml_cfg: gw_data["always_log_local"] = yaml_cfg["always_log_local"] + if "filter_silence_narration" in yaml_cfg: + gw_data["filter_silence_narration"] = yaml_cfg[ + "filter_silence_narration" + ] + if "unauthorized_dm_behavior" in yaml_cfg: gw_data["unauthorized_dm_behavior"] = _normalize_unauthorized_dm_behavior( yaml_cfg.get("unauthorized_dm_behavior"), diff --git a/gateway/delivery.py b/gateway/delivery.py index a1cbb299384..8afab431c36 100644 --- a/gateway/delivery.py +++ b/gateway/delivery.py @@ -9,6 +9,8 @@ Routes messages to the appropriate destination based on: """ import logging +import os +import re from pathlib import Path from datetime import datetime from dataclasses import dataclass @@ -21,6 +23,32 @@ logger = logging.getLogger(__name__) MAX_PLATFORM_OUTPUT = 4000 TRUNCATED_VISIBLE = 3800 +# Matches strings that are *only* a "silence" narration with optional markdown +# wrappers. Covers: *(silent)*, _silent_, `silent`, ~silent~, (silent), silent, +# 🔇, a bare ".", "…", and the whitespace/marker-padded variants seen in the +# wild. Anchored to start/end so substantive messages that merely *contain* the +# word "silent" are never matched. +_SILENCE_NARRATION = re.compile( + r'^[\s*_~`]*\(?\s*(silent|silence|no\s+response|no\s+reply)\s*\.?\)?[\s*_~`]*$' + r'|^[\s*_~`]*[\U0001F507\.\u2026]+[\s*_~`]*$', + re.IGNORECASE, +) + + +def _is_silence_narration(content: Optional[str]) -> bool: + """Return True when ``content`` is *only* a silence-narration token. + + Length-guarded (real messages are longer) and anchored to the whole string + so legitimate prose like "The deployment ran silently" or "Silence is + golden — here is the plan..." is never flagged. + """ + if not content: + return False + stripped = content.strip() + if not stripped or len(stripped) > 64: # length guard + return False + return bool(_SILENCE_NARRATION.match(stripped)) + from .config import Platform, GatewayConfig from .session import SessionSource @@ -261,6 +289,18 @@ class DeliveryRouter: path.write_text(content) return path + def _filter_silence_narration_enabled(self) -> bool: + """Whether the outbound silence-narration filter is active. + + ``HERMES_FILTER_SILENCE_NARRATION`` env var overrides config when set; + otherwise the ``gateway.filter_silence_narration`` config flag wins + (default True). + """ + env = os.getenv("HERMES_FILTER_SILENCE_NARRATION") + if env is not None: + return env.strip().lower() in ("1", "true", "yes", "on") + return bool(getattr(self.config, "filter_silence_narration", True)) + async def _deliver_to_platform( self, target: DeliveryTarget, @@ -286,6 +326,27 @@ class DeliveryRouter: + f"\n\n... [truncated, full output saved to {saved_path}]" ) + # Substrate-level anti-loop guard: drop hallucinated "silence narration" + # (*(silent)*, 🔇, a bare ".", etc.) before it ever reaches the adapter. + # In bot-to-bot channels these tokens mirror back and forth until a + # model crashes with "no content after all retries". Behavioral prompt + # rules drift across providers; this single chokepoint covers every + # platform adapter regardless of which persona's prompt failed. + # Local/file delivery (_deliver_local) is a separate path and is never + # filtered — saved silence has no loop risk. + if self._filter_silence_narration_enabled() and _is_silence_narration(content): + logger.warning( + "Dropped silence-narration outbound to %s (chat=%s): %r", + target.platform.value, + target.chat_id, + content[:40], + ) + return { + "success": True, + "filtered": "silence_narration", + "delivered": False, + } + send_metadata = dict(metadata or {}) is_named_telegram_private_topic = False named_telegram_private_topic_name: Optional[str] = None diff --git a/tests/gateway/test_delivery_silence_filter.py b/tests/gateway/test_delivery_silence_filter.py new file mode 100644 index 00000000000..d52d9876997 --- /dev/null +++ b/tests/gateway/test_delivery_silence_filter.py @@ -0,0 +1,202 @@ +"""Tests for the outbound silence-narration filter (anti-loop control). + +See the gateway delivery path: hallucinated "silence" tokens like ``*(silent)*`` +are dropped pre-send so bot-to-bot channels can't mirror them into a token-burning +loop that crashes a model with "no content after all retries". +""" + +import pytest + +from gateway.config import GatewayConfig, Platform +from gateway.delivery import ( + DeliveryRouter, + DeliveryTarget, + _is_silence_narration, +) + + +# --- Truth table ----------------------------------------------------------- + +POSITIVE_CASES = [ + "*(silent)*", + "*Silence.*", + "🔇", + ".", + "…", + "...", + "(silent)", + "_silent_", + "silent", + " *(silent)* ", + "`silent`", + "~silent~", + "Silence", + "no response", + "No Reply.", +] + +NEGATIVE_CASES = [ + "Silence is golden — here is the plan...", + "Silent install completed", + "The deployment ran silently in the background", + "ok", + "👍", + "Here is the result:\n\n- item one\n- item two", + "I have nothing to add, but here is why: the build is green.", + "silently", # word boundary — trailing letters mean it isn't a bare token + "no responses were collected from the survey", + # A 64+ char string that opens with a silence token must not be dropped. + "silent " + "x" * 70, + "", + " ", +] + + +@pytest.mark.parametrize("content", POSITIVE_CASES) +def test_is_silence_narration_positive(content): + assert _is_silence_narration(content) is True + + +@pytest.mark.parametrize("content", NEGATIVE_CASES) +def test_is_silence_narration_negative(content): + assert _is_silence_narration(content) is False + + +def test_is_silence_narration_none_safe(): + assert _is_silence_narration(None) is False + + +def test_length_guard_rejects_long_strings(): + # Exactly 65 chars of dots — over the 64-char guard, so not treated as narration. + assert _is_silence_narration("." * 65) is False + assert _is_silence_narration("." * 64) is True + + +# --- Integration through DeliveryRouter ------------------------------------ + +class RecordingAdapter: + def __init__(self): + self.calls = [] + + async def send(self, chat_id, content, metadata=None): + self.calls.append({"chat_id": chat_id, "content": content, "metadata": metadata}) + return {"success": True} + + +@pytest.mark.asyncio +async def test_silence_narration_dropped_pre_send(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.delenv("HERMES_FILTER_SILENCE_NARRATION", raising=False) + adapter = RecordingAdapter() + router = DeliveryRouter(GatewayConfig(), adapters={Platform.DISCORD: adapter}) + target = DeliveryTarget.parse("discord:99887766") + + result = await router._deliver_to_platform(target, "*(silent)*", metadata=None) + + assert adapter.calls == [] # adapter.send never invoked + assert result == { + "success": True, + "filtered": "silence_narration", + "delivered": False, + } + + +@pytest.mark.asyncio +async def test_real_message_is_delivered(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.delenv("HERMES_FILTER_SILENCE_NARRATION", raising=False) + adapter = RecordingAdapter() + router = DeliveryRouter(GatewayConfig(), adapters={Platform.DISCORD: adapter}) + target = DeliveryTarget.parse("discord:99887766") + + result = await router._deliver_to_platform( + target, "Silence is golden — here is the plan...", metadata=None + ) + + assert len(adapter.calls) == 1 + assert adapter.calls[0]["content"] == "Silence is golden — here is the plan..." + assert result == {"success": True} + + +@pytest.mark.asyncio +async def test_config_opt_out_lets_silence_through(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.delenv("HERMES_FILTER_SILENCE_NARRATION", raising=False) + adapter = RecordingAdapter() + config = GatewayConfig(filter_silence_narration=False) + router = DeliveryRouter(config, adapters={Platform.DISCORD: adapter}) + target = DeliveryTarget.parse("discord:99887766") + + result = await router._deliver_to_platform(target, "*(silent)*", metadata=None) + + assert len(adapter.calls) == 1 + assert adapter.calls[0]["content"] == "*(silent)*" + assert result == {"success": True} + + +@pytest.mark.asyncio +async def test_env_override_disables_filter(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.setenv("HERMES_FILTER_SILENCE_NARRATION", "0") + adapter = RecordingAdapter() + # Config default is True, but env override wins. + router = DeliveryRouter(GatewayConfig(), adapters={Platform.DISCORD: adapter}) + target = DeliveryTarget.parse("discord:99887766") + + result = await router._deliver_to_platform(target, "🔇", metadata=None) + + assert len(adapter.calls) == 1 + assert result == {"success": True} + + +@pytest.mark.asyncio +async def test_env_override_enables_filter_over_config(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.setenv("HERMES_FILTER_SILENCE_NARRATION", "1") + adapter = RecordingAdapter() + # Config says off, env override forces on. + config = GatewayConfig(filter_silence_narration=False) + router = DeliveryRouter(config, adapters={Platform.DISCORD: adapter}) + target = DeliveryTarget.parse("discord:99887766") + + result = await router._deliver_to_platform(target, "*(silent)*", metadata=None) + + assert adapter.calls == [] + assert result["filtered"] == "silence_narration" + + +@pytest.mark.asyncio +async def test_local_delivery_not_filtered(tmp_path, monkeypatch): + monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path) + monkeypatch.delenv("HERMES_FILTER_SILENCE_NARRATION", raising=False) + router = DeliveryRouter(GatewayConfig(), adapters={}) + + results = await router.deliver( + content="*(silent)*", + targets=[DeliveryTarget.parse("local")], + job_id="silence-job", + ) + + # Local path saved the file (no loop risk) and was not filtered. + local_result = results["local"] + assert local_result["success"] is True + saved_path = local_result["result"]["path"] + assert saved_path.endswith(".md") + + +# --- Config round-trip ------------------------------------------------------ + +def test_config_flag_defaults_true(): + assert GatewayConfig().filter_silence_narration is True + + +def test_config_from_dict_parses_flag(): + cfg = GatewayConfig.from_dict({"filter_silence_narration": False}) + assert cfg.filter_silence_narration is False + + +def test_config_to_dict_roundtrip(): + cfg = GatewayConfig(filter_silence_narration=False) + assert cfg.to_dict()["filter_silence_narration"] is False + restored = GatewayConfig.from_dict(cfg.to_dict()) + assert restored.filter_silence_narration is False From 2259c15e4d6f80d026d555c1c4b7019581283a82 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Fri, 29 May 2026 15:47:48 -0600 Subject: [PATCH 05/89] fix(gateway): clarify status session usage label --- locales/af.yaml | 2 +- locales/de.yaml | 2 +- locales/en.yaml | 2 +- locales/es.yaml | 2 +- locales/pt.yaml | 2 +- tests/gateway/test_status_command.py | 6 +++--- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/locales/af.yaml b/locales/af.yaml index 636bae754f3..fd78cdb3f1e 100644 --- a/locales/af.yaml +++ b/locales/af.yaml @@ -255,7 +255,7 @@ gateway: title: "**Titel:** {title}" created: "**Geskep:** {timestamp}" last_activity: "**Laaste aktiwiteit:** {timestamp}" - tokens: "**Tokens:** {tokens}" + tokens: "**Sessiegebruik (kumulatief):** {tokens}" agent_running: "**Agent loop:** {state}" state_yes: "Ja ⚡" state_no: "Nee" diff --git a/locales/de.yaml b/locales/de.yaml index f400dd9fb2e..1079cb8d28e 100644 --- a/locales/de.yaml +++ b/locales/de.yaml @@ -255,7 +255,7 @@ gateway: title: "**Titel:** {title}" created: "**Erstellt:** {timestamp}" last_activity: "**Letzte Aktivität:** {timestamp}" - tokens: "**Tokens:** {tokens}" + tokens: "**Sitzungsnutzung (kumulativ):** {tokens}" agent_running: "**Agent läuft:** {state}" state_yes: "Ja ⚡" state_no: "Nein" diff --git a/locales/en.yaml b/locales/en.yaml index 88d18a2f892..4d09efea410 100644 --- a/locales/en.yaml +++ b/locales/en.yaml @@ -270,7 +270,7 @@ gateway: title: "**Title:** {title}" created: "**Created:** {timestamp}" last_activity: "**Last Activity:** {timestamp}" - tokens: "**Tokens:** {tokens}" + tokens: "**Session usage (cumulative):** {tokens}" agent_running: "**Agent Running:** {state}" state_yes: "Yes ⚡" state_no: "No" diff --git a/locales/es.yaml b/locales/es.yaml index 08aaf9ad0b4..d798c4858de 100644 --- a/locales/es.yaml +++ b/locales/es.yaml @@ -255,7 +255,7 @@ gateway: title: "**Título:** {title}" created: "**Creado:** {timestamp}" last_activity: "**Última actividad:** {timestamp}" - tokens: "**Tokens:** {tokens}" + tokens: "**Uso de sesión (acumulado):** {tokens}" agent_running: "**Agente activo:** {state}" state_yes: "Sí ⚡" state_no: "No" diff --git a/locales/pt.yaml b/locales/pt.yaml index 0c0eddad91e..cbba969500f 100644 --- a/locales/pt.yaml +++ b/locales/pt.yaml @@ -255,7 +255,7 @@ gateway: title: "**Título:** {title}" created: "**Criada:** {timestamp}" last_activity: "**Última atividade:** {timestamp}" - tokens: "**Tokens:** {tokens}" + tokens: "**Uso da sessão (cumulativo):** {tokens}" agent_running: "**Agente em execução:** {state}" state_yes: "Sim ⚡" state_no: "Não" diff --git a/tests/gateway/test_status_command.py b/tests/gateway/test_status_command.py index 01222597224..9ff824a8fff 100644 --- a/tests/gateway/test_status_command.py +++ b/tests/gateway/test_status_command.py @@ -97,7 +97,7 @@ async def test_status_command_reports_running_agent_without_interrupt(monkeypatc result = await runner._handle_message(_make_event("/status")) assert "**Session ID:** `sess-1`" in result - assert "**Tokens:** 321" in result + assert "**Session usage (cumulative):** 321" in result assert "**Agent Running:** Yes ⚡" in result assert "**Title:**" not in result running_agent.interrupt.assert_not_called() @@ -150,7 +150,7 @@ async def test_status_command_reads_token_totals_from_session_db(): result = await runner._handle_message(_make_event("/status")) # 1000 + 250 + 500 + 100 + 50 = 1,900 - assert "**Tokens:** 1,900" in result + assert "**Session usage (cumulative):** 1,900" in result @pytest.mark.asyncio @@ -171,7 +171,7 @@ async def test_status_command_tokens_zero_when_session_db_row_missing(): result = await runner._handle_message(_make_event("/status")) - assert "**Tokens:** 0" in result + assert "**Session usage (cumulative):** 0" in result @pytest.mark.asyncio From 9d4c81130a39f4a725b8301610d52c7cbff06fc6 Mon Sep 17 00:00:00 2001 From: teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:02:49 -0700 Subject: [PATCH 06/89] fix(gateway): name what the /status token number actually is MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sharpen the label from 'Session usage (cumulative)' to 'Cumulative API tokens (re-sent each call)'. The number is real provider-reported usage summed across every API call in the session — not context size. In an agentic loop the same context is re-sent each iteration, so a one-hour tool-heavy session legitimately reaches tens of millions of tokens. The new label explains the magnitude so users stop reading it as a bug or as a total across all sessions. --- locales/af.yaml | 2 +- locales/de.yaml | 2 +- locales/en.yaml | 2 +- locales/es.yaml | 2 +- locales/pt.yaml | 2 +- tests/gateway/test_status_command.py | 6 +++--- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/locales/af.yaml b/locales/af.yaml index fd78cdb3f1e..a64e759c441 100644 --- a/locales/af.yaml +++ b/locales/af.yaml @@ -255,7 +255,7 @@ gateway: title: "**Titel:** {title}" created: "**Geskep:** {timestamp}" last_activity: "**Laaste aktiwiteit:** {timestamp}" - tokens: "**Sessiegebruik (kumulatief):** {tokens}" + tokens: "**Kumulatiewe API-tokens (elke oproep weer gestuur):** {tokens}" agent_running: "**Agent loop:** {state}" state_yes: "Ja ⚡" state_no: "Nee" diff --git a/locales/de.yaml b/locales/de.yaml index 1079cb8d28e..4b84f2e4b66 100644 --- a/locales/de.yaml +++ b/locales/de.yaml @@ -255,7 +255,7 @@ gateway: title: "**Titel:** {title}" created: "**Erstellt:** {timestamp}" last_activity: "**Letzte Aktivität:** {timestamp}" - tokens: "**Sitzungsnutzung (kumulativ):** {tokens}" + tokens: "**Kumulierte API-Tokens (bei jedem Aufruf erneut gesendet):** {tokens}" agent_running: "**Agent läuft:** {state}" state_yes: "Ja ⚡" state_no: "Nein" diff --git a/locales/en.yaml b/locales/en.yaml index 4d09efea410..93d7ffdc433 100644 --- a/locales/en.yaml +++ b/locales/en.yaml @@ -270,7 +270,7 @@ gateway: title: "**Title:** {title}" created: "**Created:** {timestamp}" last_activity: "**Last Activity:** {timestamp}" - tokens: "**Session usage (cumulative):** {tokens}" + tokens: "**Cumulative API tokens (re-sent each call):** {tokens}" agent_running: "**Agent Running:** {state}" state_yes: "Yes ⚡" state_no: "No" diff --git a/locales/es.yaml b/locales/es.yaml index d798c4858de..6a3cccb66a4 100644 --- a/locales/es.yaml +++ b/locales/es.yaml @@ -255,7 +255,7 @@ gateway: title: "**Título:** {title}" created: "**Creado:** {timestamp}" last_activity: "**Última actividad:** {timestamp}" - tokens: "**Uso de sesión (acumulado):** {tokens}" + tokens: "**Tokens de API acumulados (reenviados en cada llamada):** {tokens}" agent_running: "**Agente activo:** {state}" state_yes: "Sí ⚡" state_no: "No" diff --git a/locales/pt.yaml b/locales/pt.yaml index cbba969500f..662971f08b7 100644 --- a/locales/pt.yaml +++ b/locales/pt.yaml @@ -255,7 +255,7 @@ gateway: title: "**Título:** {title}" created: "**Criada:** {timestamp}" last_activity: "**Última atividade:** {timestamp}" - tokens: "**Uso da sessão (cumulativo):** {tokens}" + tokens: "**Tokens de API cumulativos (reenviados a cada chamada):** {tokens}" agent_running: "**Agente em execução:** {state}" state_yes: "Sim ⚡" state_no: "Não" diff --git a/tests/gateway/test_status_command.py b/tests/gateway/test_status_command.py index 9ff824a8fff..0b88d271808 100644 --- a/tests/gateway/test_status_command.py +++ b/tests/gateway/test_status_command.py @@ -97,7 +97,7 @@ async def test_status_command_reports_running_agent_without_interrupt(monkeypatc result = await runner._handle_message(_make_event("/status")) assert "**Session ID:** `sess-1`" in result - assert "**Session usage (cumulative):** 321" in result + assert "**Cumulative API tokens (re-sent each call):** 321" in result assert "**Agent Running:** Yes ⚡" in result assert "**Title:**" not in result running_agent.interrupt.assert_not_called() @@ -150,7 +150,7 @@ async def test_status_command_reads_token_totals_from_session_db(): result = await runner._handle_message(_make_event("/status")) # 1000 + 250 + 500 + 100 + 50 = 1,900 - assert "**Session usage (cumulative):** 1,900" in result + assert "**Cumulative API tokens (re-sent each call):** 1,900" in result @pytest.mark.asyncio @@ -171,7 +171,7 @@ async def test_status_command_tokens_zero_when_session_db_row_missing(): result = await runner._handle_message(_make_event("/status")) - assert "**Session usage (cumulative):** 0" in result + assert "**Cumulative API tokens (re-sent each call):** 0" in result @pytest.mark.asyncio From 897f9533ed511345d0a729af507abdb2308cfbcb Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:21:15 -0700 Subject: [PATCH 07/89] fix: keep CLI context display in sync with preflight token estimate (#35079) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here' Adds a user-chosen compression boundary to the existing /compress command. /compress here [N] summarizes everything except the most recent N exchanges (default 2), which are preserved verbatim — letting the user pick the compression boundary instead of relying on the automatic token-budget heuristic. Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139, Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20 - hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation guard (shared by CLI and gateway). - cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression; compress only the head, re-append the verbatim tail through the seam guard. - Preserves message-flow role alternation (seam guard merges any illegal user->user / assistant->assistant adjacency). - Reuses the existing _compress_context session-rotation/lock machinery — no changes to the compression core. - Bare /compress (full) and /compress behavior unchanged. Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved tool-call transcript, degenerate/multimodal seams, real handler path). * fix: keep CLI context display in sync with preflight token estimate The status bar reads compressor.last_prompt_tokens, which only updates from a successful API response. When loaded history is oversized but compression no-ops (e.g. the auxiliary summary model times out), no fresh usage arrives and the bar stays frozen at the old, smaller value while the preflight estimate reports a much larger number — looking permanently out of sync (reported: 74.4K display vs ~144,669 preflight). Seed last_prompt_tokens with the fresh preflight estimate (upward-only, so a real usage figure is never clobbered and a successful compression's downward correction still wins). Display-only; no behavioral change to compression, caching, or the agent loop. --- agent/conversation_loop.py | 20 +++++++- tests/run_agent/test_413_compression.py | 68 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index f72014b9c0a..e23a513aa51 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -606,8 +606,26 @@ def run_conversation( "should_defer_preflight_to_real_usage", lambda _tokens: False, ) + _preflight_deferred = _defer_preflight(_preflight_tokens) - if _defer_preflight(_preflight_tokens): + if not _preflight_deferred: + # Keep the CLI/ACP context display in sync with what preflight + # actually measured. The status bar reads + # ``compressor.last_prompt_tokens``, which otherwise only updates + # from a *successful* API response. When the conversation has grown + # since the last successful call — or when compression then fails + # (e.g. the auxiliary summary model times out) and no fresh usage + # arrives — the bar stays stuck at the old, smaller value while + # preflight reports a much larger number, looking out of sync. + # Seed it with the fresh estimate (only ever revising upward; a real + # ``update_from_response`` will correct it after the next API call). + # Skipped when deferring — a deferred estimate is known to over-count + # vs the last real provider prompt, so trusting it for the display + # would re-introduce the very desync we're avoiding. + if _preflight_tokens > (_compressor.last_prompt_tokens or 0): + _compressor.last_prompt_tokens = _preflight_tokens + + if _preflight_deferred: logger.info( "Skipping preflight compression: rough estimate ~%s >= %s, " "but last real provider prompt was %s after compression", diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index a2838d7cfbd..cadb26c449b 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -665,6 +665,74 @@ class TestPreflightCompression: mock_compress.assert_not_called() assert result["completed"] is True + def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent): + """Display must reflect the real context size even when compression no-ops. + + Regression: the CLI status bar reads ``last_prompt_tokens``, which only + updated from a *successful* API response. When the loaded history was + oversized but compression failed to reduce it (e.g. the auxiliary + summary model timed out), the bar stayed stuck at the old, smaller + value while the preflight estimate reported a much larger number — + looking permanently out of sync. + """ + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # Simulate a stale display value from an earlier, smaller turn. + agent.context_compressor.last_prompt_tokens = 74_400 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + # Compression no-ops (returns input unchanged) — mirrors an aux + # summary-model timeout where the messages can't be reduced. + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + assert result["completed"] is True + # The display token count was revised up to the fresh preflight estimate, + # not left at the stale 74_400. + assert agent.context_compressor.last_prompt_tokens == 144_669 + + def test_preflight_seed_only_revises_upward(self, agent): + """A larger tracked value must not be clobbered by a smaller estimate.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # A real, larger usage figure is already tracked. + agent.context_compressor.last_prompt_tokens = 160_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + agent.run_conversation("hello", conversation_history=big_history) + + # Smaller estimate must not overwrite the larger tracked value. + assert agent.context_compressor.last_prompt_tokens == 160_000 + class TestToolResultPreflightCompression: """Compression should trigger when tool results push context past the threshold.""" From 59b0ea98c8956a2fd1e875a673423d30175b7f9b Mon Sep 17 00:00:00 2001 From: Bartok9 Date: Fri, 29 May 2026 03:40:51 -0400 Subject: [PATCH 08/89] fix(agent): explain abnormal turn endings instead of blank/partial reply When a turn ends abnormally after substantive tool calls (empty content after retries, a partial/truncated stream, exhausted retries, or an iteration/budget limit), the CLI/TUI response area was left blank or showed only a fragment (e.g. "The") with no consolidated reason. The internal turn_exit_reason values (empty_response_exhausted, partial_stream_recovery, etc.) were never surfaced to the user. Add a turn-completion explainer that mirrors the existing file-mutation verifier footer: at turn end, map an abnormal turn_exit_reason to a short, actionable message and either replace the bare "(empty)" sentinel or append the reason after a partial fragment. Normal text_response exits (e.g. a terse "Done.") stay quiet. Gated by display.turn_completion_explainer (default on) with HERMES_TURN_COMPLETION_EXPLAINER env override, matching the file-mutation verifier seam. Closes #34452 --- agent/conversation_loop.py | 49 +++++ run_agent.py | 120 ++++++++++++ tests/run_agent/test_run_agent.py | 23 ++- .../test_turn_completion_explainer.py | 181 ++++++++++++++++++ 4 files changed, 368 insertions(+), 5 deletions(-) create mode 100644 tests/run_agent/test_turn_completion_explainer.py diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index e23a513aa51..cf77d9a1b51 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -4480,6 +4480,55 @@ def run_conversation( except Exception as _ver_err: logger.debug("file-mutation verifier footer failed: %s", _ver_err) + # Turn-completion explainer. + # When a turn ends abnormally after substantive work — empty content + # after retries, a partial/truncated stream, a still-pending tool + # result, or an iteration/budget limit — the user otherwise gets a + # blank or fragmentary response box with no consolidated reason why + # the agent stopped (#34452). Surface a single user-visible + # explanation derived from ``_turn_exit_reason``, mirroring the + # file-mutation verifier footer pattern above. + # + # Gate carefully so healthy turns stay quiet: + # - ``text_response(...)`` exits never produce an explanation + # (handled inside the formatter), so a terse ``Done.`` is silent. + # - We only ACT when there is no genuinely usable reply this turn: + # an empty response, the "(empty)" terminal sentinel, or a + # suspiciously short partial fragment with no terminating + # punctuation (e.g. "The"). A real short answer keeps its text. + if not interrupted: + try: + if agent._turn_completion_explainer_enabled(): + _stripped = (final_response or "").strip() + _is_empty_terminal = _stripped == "" or _stripped == "(empty)" + # A short fragment that is not a normal text_response exit + # and lacks sentence-ending punctuation is treated as a + # truncated partial (the "The" case from #34452). + _is_partial_fragment = ( + not _is_empty_terminal + and not str(_turn_exit_reason).startswith("text_response") + and len(_stripped) <= 24 + and _stripped[-1:] not in {".", "!", "?", "。", "!", "?", "`", ")"} + ) + if _is_empty_terminal or _is_partial_fragment: + _explanation = agent._format_turn_completion_explanation( + _turn_exit_reason + ) + if _explanation: + if _is_empty_terminal: + # Replace the bare "(empty)"/blank sentinel with + # the actionable explanation. + final_response = _explanation + else: + # Keep the partial fragment, append the reason so + # the user sees both what arrived and why it + # stopped. + final_response = ( + _stripped + "\n\n" + _explanation + ) + except Exception as _exp_err: + logger.debug("turn-completion explainer failed: %s", _exp_err) + _response_transformed = False # Plugin hook: transform_llm_output diff --git a/run_agent.py b/run_agent.py index 55df748a5a4..a737fbd78bd 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2138,6 +2138,126 @@ class AIAgent: lines.append(f" • … and {remaining} more") return "\n".join(lines) + def _turn_completion_explainer_enabled(self) -> bool: + """Check whether the end-of-turn completion explainer footer is on. + + Config path: ``display.turn_completion_explainer`` (bool, default + True). ``HERMES_TURN_COMPLETION_EXPLAINER`` env var overrides + config. Exposed as a method so tests can patch a single seam, + mirroring ``_file_mutation_verifier_enabled``. + """ + try: + import os as _os + env = _os.environ.get("HERMES_TURN_COMPLETION_EXPLAINER") + if env is not None: + return env.strip().lower() not in {"0", "false", "no", "off"} + # Read from the persisted config.yaml so gateway and CLI share + # the same setting. Import lazily to avoid a startup-time cycle. + try: + from hermes_cli.config import load_config as _load_config + _cfg = _load_config() or {} + except Exception: + _cfg = {} + _display = _cfg.get("display") if isinstance(_cfg, dict) else None + if isinstance(_display, dict) and "turn_completion_explainer" in _display: + return bool(_display.get("turn_completion_explainer")) + except Exception: + pass + return True # safe default: explainer on + + @staticmethod + def _format_turn_completion_explanation(turn_exit_reason: str) -> str: + """Render a user-facing explanation for an abnormal turn ending. + + Maps the internal ``turn_exit_reason`` to a short, actionable + message so a turn that produced no usable assistant reply (empty + content after retries, a partial/truncated stream, a still-pending + tool result, or an iteration/budget limit) is never silent from + the UI's perspective — the symptom users report in #34452. + + Returns an empty string for reasons that are NOT abnormal (e.g. + a normal ``text_response(...)`` exit), so callers can concatenate + or substitute unconditionally without warning on healthy turns + like a terse ``Done.``. + """ + if not turn_exit_reason: + return "" + reason = str(turn_exit_reason) + + # Normal completion — stay quiet. ``text_response(...)`` is the + # healthy terminal; anything that produced a real reply is fine. + if reason.startswith("text_response"): + return "" + + prefix = "⚠️ Turn ended without a usable reply: " + if reason == "empty_response_exhausted": + return ( + prefix + + "the model returned empty content after retries and any " + "fallback providers. Try `continue`, switch model/provider, " + "or inspect the tool output above." + ) + if reason == "all_retries_exhausted_no_response": + return ( + prefix + + "all API retries were exhausted before a response was " + "produced (provider errors / rate limits). Try `continue` " + "or switch provider." + ) + if reason == "partial_stream_recovery": + return ( + prefix + + "streaming stopped early and only a partial response was " + "recovered. Send `continue` to resume from where it stopped." + ) + if reason == "fallback_prior_turn_content": + return ( + prefix + + "no new content was produced this turn; showing recovered " + "prior context. Send `continue` to retry." + ) + if reason == "interrupted_during_api_call": + return ( + prefix + + "the request was interrupted mid-call before a reply was " + "received. Send `continue` to retry." + ) + if reason == "budget_exhausted": + return ( + prefix + + "the per-turn iteration/cost budget was exhausted before a " + "final answer. Send `continue` to keep going." + ) + if reason == "ollama_runtime_context_too_small": + return ( + prefix + + "the local model's context window was too small to finish. " + "Increase the context size or use a larger model." + ) + if reason.startswith("max_iterations_reached"): + return ( + prefix + + "the maximum tool-iteration limit was reached before a " + "final answer. Send `continue` to keep going, or raise " + "`max_iterations`." + ) + if reason.startswith("error_near_max_iterations"): + return ( + prefix + + "an error occurred near the iteration limit before a final " + "answer. Check the tool output above, then send `continue`." + ) + if reason == "pending_tool_result": + return ( + prefix + + "the turn stopped while a tool result was still pending and " + "the model produced no follow-up text. Send `continue` to " + "let it summarize." + ) + # Unknown/diagnostic-only reasons (e.g. "unknown", guardrail_halt + # which already surfaces its own message) — don't second-guess. + return "" + def _apply_pending_steer_to_tool_results(self, messages: list, num_tool_msgs: int) -> None: """Forwarder — see ``agent.agent_runtime_helpers.apply_pending_steer_to_tool_results``.""" from agent.agent_runtime_helpers import apply_pending_steer_to_tool_results diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 07ff74930b0..0da60572c3e 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -3046,7 +3046,11 @@ class TestRunConversation: mock_compress.assert_not_called() # no compression triggered assert result["completed"] is True - assert result["final_response"] == "(empty)" + # #34452: the bare "(empty)" sentinel is now replaced by a + # user-visible end-of-turn explanation so the failure isn't silent. + assert result["final_response"] != "(empty)" + assert "without a usable reply" in result["final_response"] + assert result["turn_exit_reason"] == "empty_response_exhausted" assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries def test_reasoning_only_response_prefill_then_empty(self, agent): @@ -3066,7 +3070,9 @@ class TestRunConversation: ): result = agent.run_conversation("answer me") assert result["completed"] is True - assert result["final_response"] == "(empty)" + # #34452: explanation replaces the bare "(empty)" sentinel. + assert result["final_response"] != "(empty)" + assert "without a usable reply" in result["final_response"] assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries def test_reasoning_only_prefill_succeeds_on_continuation(self, agent): @@ -3113,7 +3119,9 @@ class TestRunConversation: ): result = agent.run_conversation("answer me") assert result["completed"] is True - assert result["final_response"] == "(empty)" + # #34452: explanation replaces the bare "(empty)" sentinel. + assert result["final_response"] != "(empty)" + assert "without a usable reply" in result["final_response"] assert result["api_calls"] == 4 # 1 original + 3 retries def test_truly_empty_response_succeeds_on_nudge(self, agent): @@ -3209,7 +3217,9 @@ class TestRunConversation: ): result = agent.run_conversation("answer me") assert result["completed"] is True - assert result["final_response"] == "(empty)" + # #34452: explanation replaces the bare "(empty)" sentinel. + assert result["final_response"] != "(empty)" + assert "without a usable reply" in result["final_response"] def test_empty_response_emits_status_for_gateway(self, agent): """_emit_status is called during empty retries so gateway users see feedback.""" @@ -3235,7 +3245,10 @@ class TestRunConversation: ): result = agent.run_conversation("answer me") - assert result["final_response"] == "(empty)" + # #34452: explanation replaces the bare "(empty)" sentinel, but the + # status emissions during retries are unchanged. + assert result["final_response"] != "(empty)" + assert "without a usable reply" in result["final_response"] # Should have emitted retry statuses (3 retries) + final failure retry_msgs = [m for m in status_messages if "retrying" in m.lower()] assert len(retry_msgs) == 3, f"Expected 3 retry status messages, got {len(retry_msgs)}: {status_messages}" diff --git a/tests/run_agent/test_turn_completion_explainer.py b/tests/run_agent/test_turn_completion_explainer.py new file mode 100644 index 00000000000..b120272b04a --- /dev/null +++ b/tests/run_agent/test_turn_completion_explainer.py @@ -0,0 +1,181 @@ +"""Tests for the end-of-turn completion explainer (#34452). + +When a turn ends abnormally after tools (empty content after retries, a +partial/truncated stream, exhausted retries, or an iteration/budget limit) +the user should get a single user-visible explanation of why the reply +stopped instead of a blank or fragmentary response box. Normal short +replies (e.g. ``Done.``) must stay quiet. + +These tests exercise: + 1. ``_format_turn_completion_explanation`` — the pure reason→message map. + 2. ``_turn_completion_explainer_enabled`` — the env/config seam. + 3. An end-to-end ``run_conversation`` turn that exhausts empty-response + retries and verifies the explanation reaches ``final_response``. + +All assertions work under the mocked OpenAI SDK used elsewhere in this +suite (we patch ``run_agent.OpenAI`` and drive ``agent.client``), so they +pass identically in CI and locally. +""" + +import os +import uuid +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +from run_agent import AIAgent + + +# -------------------------------------------------------------------------- +# Fixtures (mirrors tests/run_agent/test_tool_call_guardrail_runtime.py) +# -------------------------------------------------------------------------- +def _mock_response(content="Hello", finish_reason="stop", tool_calls=None): + msg = SimpleNamespace(content=content, tool_calls=tool_calls) + choice = SimpleNamespace(message=msg, finish_reason=finish_reason) + return SimpleNamespace(choices=[choice], model="test/model", usage=None) + + +def _make_agent(max_iterations: int = 10, config: dict | None = None) -> AIAgent: + with ( + patch("run_agent.get_tool_definitions", return_value=[]), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("hermes_cli.config.load_config", return_value=config or {}), + patch("run_agent.OpenAI"), + ): + agent = AIAgent( + api_key="test-key-1234567890", + base_url="https://openrouter.ai/api/v1", + max_iterations=max_iterations, + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + agent.client = MagicMock() + agent._cached_system_prompt = "You are helpful." + agent._use_prompt_caching = False + agent.tool_delay = 0 + agent.compression_enabled = False + agent.save_trajectories = False + # No fallback chain so empty responses exhaust deterministically. + agent._fallback_chain = [] + return agent + + +# -------------------------------------------------------------------------- +# 1. Pure formatter +# -------------------------------------------------------------------------- +def test_explanation_quiet_for_normal_text_response(): + """A healthy text_response exit must NOT produce any explanation.""" + out = AIAgent._format_turn_completion_explanation( + "text_response(finish_reason=stop)" + ) + assert out == "" + + +def test_explanation_quiet_for_empty_reason(): + assert AIAgent._format_turn_completion_explanation("") == "" + assert AIAgent._format_turn_completion_explanation("unknown") == "" + # guardrail_halt surfaces its own message; explainer stays out of the way. + assert AIAgent._format_turn_completion_explanation("guardrail_halt") == "" + + +def test_explanation_for_empty_response_exhausted(): + out = AIAgent._format_turn_completion_explanation("empty_response_exhausted") + assert out # non-empty + assert "empty content" in out + assert "continue" in out.lower() + + +def test_explanation_for_partial_stream_recovery(): + out = AIAgent._format_turn_completion_explanation("partial_stream_recovery") + assert "partial" in out.lower() + assert "continue" in out.lower() + + +def test_explanation_for_max_iterations_reached_prefix_match(): + """``max_iterations_reached(...)`` carries a parenthetical suffix.""" + out = AIAgent._format_turn_completion_explanation( + "max_iterations_reached(10/10)" + ) + assert "iteration" in out.lower() + + +def test_explanation_for_all_retries_exhausted(): + out = AIAgent._format_turn_completion_explanation( + "all_retries_exhausted_no_response" + ) + assert "retries" in out.lower() + + +# -------------------------------------------------------------------------- +# 2. Enable/disable seam +# -------------------------------------------------------------------------- +def test_explainer_enabled_by_default(): + agent = _make_agent() + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("HERMES_TURN_COMPLETION_EXPLAINER", None) + with patch("hermes_cli.config.load_config", return_value={}): + assert agent._turn_completion_explainer_enabled() is True + + +def test_explainer_disabled_via_env(): + agent = _make_agent() + with patch.dict( + os.environ, {"HERMES_TURN_COMPLETION_EXPLAINER": "0"}, clear=False + ): + assert agent._turn_completion_explainer_enabled() is False + + +def test_explainer_disabled_via_config(): + agent = _make_agent() + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("HERMES_TURN_COMPLETION_EXPLAINER", None) + with patch( + "hermes_cli.config.load_config", + return_value={"display": {"turn_completion_explainer": False}}, + ): + assert agent._turn_completion_explainer_enabled() is False + + +# -------------------------------------------------------------------------- +# 3. End-to-end: empty-response exhaustion surfaces the explanation +# -------------------------------------------------------------------------- +def test_run_conversation_empty_exhausted_surfaces_explanation(): + """Four empty responses in a row should exhaust retries and the final + response should be the actionable explanation, not a bare '(empty)'.""" + agent = _make_agent(max_iterations=10) + # 4 empty responses: retries 1..3 then the terminal on the 4th. + agent.client.chat.completions.create.side_effect = [ + _mock_response(content="", finish_reason="stop") for _ in range(8) + ] + + with ( + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("do something") + + assert result["turn_exit_reason"] == "empty_response_exhausted" + # The user must NOT be left with a bare sentinel; the explanation wins. + assert result["final_response"] != "(empty)" + assert result["final_response"].strip() != "" + assert "without a usable reply" in result["final_response"] + + +def test_run_conversation_normal_reply_stays_quiet(): + """A normal short reply like 'Done.' must NOT get an explainer footer.""" + agent = _make_agent(max_iterations=10) + agent.client.chat.completions.create.side_effect = [ + _mock_response(content="Done.", finish_reason="stop"), + ] + + with ( + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("do something") + + assert result["turn_exit_reason"].startswith("text_response") + assert result["final_response"] == "Done." + assert "without a usable reply" not in result["final_response"] From de6d6023d7486dcaa757037f2e3ba13985302aca Mon Sep 17 00:00:00 2001 From: Bartok9 Date: Fri, 29 May 2026 11:55:48 -0400 Subject: [PATCH 09/89] test(run_agent): align test_dict_tool_call_args with explainer suffix PR #34470 adds an explainer suffix to abnormal turn endings (e.g. max_iterations_reached) so users see why the response is short instead of receiving a bare/blank reply. test_tool_call_validation_accepts_dict_arguments runs the agent at max_iterations=3 which hits the explainer path; the existing strict-equality assertion (== "done") no longer matches once the suffix is appended. Switch the assertion to .startswith("done") so the test continues to verify that the models actual text survives intact while leaving the explainer suffix wording owned by conversation_loop (where it belongs). Test now passes (1 passed in 0.88s). --- tests/run_agent/test_dict_tool_call_args.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/run_agent/test_dict_tool_call_args.py b/tests/run_agent/test_dict_tool_call_args.py index 61ee6fc5c28..ac249919fa1 100644 --- a/tests/run_agent/test_dict_tool_call_args.py +++ b/tests/run_agent/test_dict_tool_call_args.py @@ -70,4 +70,9 @@ def test_tool_call_validation_accepts_dict_arguments(monkeypatch): result = agent.run_conversation("read the file") - assert result["final_response"] == "done" + # The conversation hits max_iterations=3 (3 tool turns then forced summary). + # PR #34470 adds an explainer suffix to abnormal turn endings so users + # understand why the response is short instead of seeing a blank reply. + # The exact suffix wording is owned by conversation_loop; this test only + # cares that the model's actual text ('done') survives at the start. + assert result["final_response"].startswith("done") From fb0ab27649bac911bec4330d29cf4376d75a2552 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 13:20:13 -0700 Subject: [PATCH 10/89] fix(agent): register explainer config key + shorten footer prefix Follow-up to the salvaged #34452 turn-completion explainer: - Register display.turn_completion_explainer: True in DEFAULT_CONFIG so the setting is discoverable, matching the file_mutation_verifier precedent. - Shorten the repeated footer prefix from 'Turn ended without a usable reply: ' to 'No reply: ' so the 10 reason variants don't all open with the same 8-word boilerplate. - Update the 7 assertions that referenced the old prefix. --- hermes_cli/config.py | 7 +++++++ run_agent.py | 2 +- tests/run_agent/test_run_agent.py | 10 +++++----- tests/run_agent/test_turn_completion_explainer.py | 4 ++-- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index e2c59a694fe..87aac11b864 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1202,6 +1202,13 @@ DEFAULT_CONFIG = { # class of over-claim that otherwise forces users to run # `git status` to verify edits landed. Set false to suppress. "file_mutation_verifier": True, + # Turn-completion explainer. When true (default), the agent appends a + # one-line explanation to its final response whenever a turn ends + # abnormally with no usable reply — empty content after retries, a + # partial/truncated stream, a still-pending tool result, or an + # iteration/budget limit. Replaces the bare "(empty)" sentinel so the + # failure isn't silent from the UI's perspective. Set false to suppress. + "turn_completion_explainer": True, "show_cost": False, # Show $ cost in the status bar (off by default) "skin": "default", # UI language for static user-facing messages (approval prompts, a diff --git a/run_agent.py b/run_agent.py index a737fbd78bd..88b93a0b28a 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2189,7 +2189,7 @@ class AIAgent: if reason.startswith("text_response"): return "" - prefix = "⚠️ Turn ended without a usable reply: " + prefix = "⚠️ No reply: " if reason == "empty_response_exhausted": return ( prefix diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 0da60572c3e..f5112824a7a 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -3049,7 +3049,7 @@ class TestRunConversation: # #34452: the bare "(empty)" sentinel is now replaced by a # user-visible end-of-turn explanation so the failure isn't silent. assert result["final_response"] != "(empty)" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] assert result["turn_exit_reason"] == "empty_response_exhausted" assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries @@ -3072,7 +3072,7 @@ class TestRunConversation: assert result["completed"] is True # #34452: explanation replaces the bare "(empty)" sentinel. assert result["final_response"] != "(empty)" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries def test_reasoning_only_prefill_succeeds_on_continuation(self, agent): @@ -3121,7 +3121,7 @@ class TestRunConversation: assert result["completed"] is True # #34452: explanation replaces the bare "(empty)" sentinel. assert result["final_response"] != "(empty)" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] assert result["api_calls"] == 4 # 1 original + 3 retries def test_truly_empty_response_succeeds_on_nudge(self, agent): @@ -3219,7 +3219,7 @@ class TestRunConversation: assert result["completed"] is True # #34452: explanation replaces the bare "(empty)" sentinel. assert result["final_response"] != "(empty)" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] def test_empty_response_emits_status_for_gateway(self, agent): """_emit_status is called during empty retries so gateway users see feedback.""" @@ -3248,7 +3248,7 @@ class TestRunConversation: # #34452: explanation replaces the bare "(empty)" sentinel, but the # status emissions during retries are unchanged. assert result["final_response"] != "(empty)" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] # Should have emitted retry statuses (3 retries) + final failure retry_msgs = [m for m in status_messages if "retrying" in m.lower()] assert len(retry_msgs) == 3, f"Expected 3 retry status messages, got {len(retry_msgs)}: {status_messages}" diff --git a/tests/run_agent/test_turn_completion_explainer.py b/tests/run_agent/test_turn_completion_explainer.py index b120272b04a..a04cc1e5e36 100644 --- a/tests/run_agent/test_turn_completion_explainer.py +++ b/tests/run_agent/test_turn_completion_explainer.py @@ -159,7 +159,7 @@ def test_run_conversation_empty_exhausted_surfaces_explanation(): # The user must NOT be left with a bare sentinel; the explanation wins. assert result["final_response"] != "(empty)" assert result["final_response"].strip() != "" - assert "without a usable reply" in result["final_response"] + assert "No reply:" in result["final_response"] def test_run_conversation_normal_reply_stays_quiet(): @@ -178,4 +178,4 @@ def test_run_conversation_normal_reply_stays_quiet(): assert result["turn_exit_reason"].startswith("text_response") assert result["final_response"] == "Done." - assert "without a usable reply" not in result["final_response"] + assert "No reply:" not in result["final_response"] From 860cf28dabbaf93459a778a835edbc3663e381c5 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:59:04 -0700 Subject: [PATCH 11/89] docs: clarify compression threshold is derived from the main model's context window (#35099) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compression threshold is threshold × context_length where context_length is the MAIN agent model's window, not the auxiliary/summary model's. On a 262,144-token model at the default 0.50 the threshold is 131,072 — close to a common 128K figure by coincidence of the percentage, which has led to confusion that the auxiliary model's context limit is the trigger. Add a note preempting that misreading and pointing to the separate summary-model-context constraint. --- .../context-compression-and-caching.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/docs/developer-guide/context-compression-and-caching.md b/website/docs/developer-guide/context-compression-and-caching.md index 4b511756181..55641b16f27 100644 --- a/website/docs/developer-guide/context-compression-and-caching.md +++ b/website/docs/developer-guide/context-compression-and-caching.md @@ -111,6 +111,17 @@ tail_token_budget = 100,000 × 0.20 = 20,000 max_summary_tokens = min(200,000 × 0.05, 12,000) = 10,000 ``` +:::note Threshold is derived from the MAIN model's context window +`threshold_tokens` is always `threshold × context_length`, where `context_length` +is the **main agent model's** context window — never the auxiliary/summary +model's. On a 262,144-token model at the default `0.50`, the threshold is +`262,144 × 0.50 = 131,072`. That number being close to a common "128K context" +is a coincidence of the percentage, not a sign that the auxiliary model's window +is the trigger. The auxiliary model's context window is a separate concern — see +the "Summary model context length" warning below for how it affects whether a +summary can be produced, not when compression fires. +::: + ## Compression Algorithm From 5ad2b4c6dab78e6e5522c8fc02bcbb89a555f47e Mon Sep 17 00:00:00 2001 From: LeonSGP43 Date: Thu, 16 Apr 2026 18:51:07 +0800 Subject: [PATCH 12/89] fix(session): degrade gracefully when SQLite lacks FTS5 --- hermes_state.py | 23 +++++++++++++++++--- tests/test_hermes_state.py | 43 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index 2b6cedeaaf3..7242e6b179c 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -380,6 +380,7 @@ class SessionDB: self._lock = threading.Lock() self._write_count = 0 + self._fts_enabled = False try: self._conn = sqlite3.connect( str(self.db_path), @@ -388,7 +389,6 @@ class SessionDB: # handles contention instead of sitting in SQLite's internal # busy handler for up to 30s. timeout=1.0, - # Autocommit mode: Python's default isolation_level="" # auto-starts transactions on DML, which conflicts with our # explicit BEGIN IMMEDIATE. None = we manage transactions # ourselves. @@ -724,8 +724,22 @@ class SessionDB: # FTS5 setup (separate because CREATE VIRTUAL TABLE can't be in executescript with IF NOT EXISTS reliably) try: cursor.execute("SELECT * FROM messages_fts LIMIT 0") - except sqlite3.OperationalError: - cursor.executescript(FTS_SQL) + self._fts_enabled = True + except sqlite3.OperationalError as exc: + if "no such table" not in str(exc).lower(): + raise + try: + cursor.executescript(FTS_SQL) + self._fts_enabled = True + except sqlite3.OperationalError as fts_exc: + err = str(fts_exc).lower() + if "fts5" not in err and "no such module" not in err: + raise + logger.warning( + "SQLite FTS5 unavailable for %s; full-text search disabled: %s", + self.db_path, + fts_exc, + ) # Trigram FTS5 for CJK/substring search try: @@ -2317,6 +2331,9 @@ class SessionDB: ignores ``sort``. The trigram CJK path honours ``sort`` like the main FTS5 path. """ + if not self._fts_enabled: + return [] + if not query or not query.strip(): return [] diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index cec3c13f0da..d14f065aec9 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -1,11 +1,31 @@ """Tests for hermes_state.py — SessionDB SQLite CRUD, FTS5 search, export.""" +import sqlite3 import time import pytest from hermes_state import SessionDB +class _NoFtsCursor(sqlite3.Cursor): + """Simulate a SQLite build without the fts5 module.""" + + def execute(self, sql, parameters=()): + if sql.strip() == "SELECT * FROM messages_fts LIMIT 0": + raise sqlite3.OperationalError("no such table: messages_fts") + return super().execute(sql, parameters) + + def executescript(self, sql_script): + if "CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5" in sql_script: + raise sqlite3.OperationalError("no such module: fts5") + return super().executescript(sql_script) + + +class _NoFtsConnection(sqlite3.Connection): + def cursor(self, factory=None): + return super().cursor(factory or _NoFtsCursor) + + @pytest.fixture() def db(tmp_path): """Create a SessionDB with a temp database file.""" @@ -135,6 +155,29 @@ class TestSessionLifecycle: child = db.get_session("child") assert child["parent_session_id"] == "parent" + def test_db_initializes_without_fts5_module(self, tmp_path, monkeypatch): + real_connect = sqlite3.connect + + def connect_without_fts(*args, **kwargs): + kwargs["factory"] = _NoFtsConnection + return real_connect(*args, **kwargs) + + monkeypatch.setattr("hermes_state.sqlite3.connect", connect_without_fts) + + db = SessionDB(db_path=tmp_path / "state.db") + try: + assert db._fts_enabled is False + + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="hello from sqlite without fts") + + messages = db.get_messages("s1") + assert len(messages) == 1 + assert messages[0]["content"] == "hello from sqlite without fts" + assert db.search_messages("hello") == [] + finally: + db.close() + # ========================================================================= # Message storage From 97ecfa0fc487322aa7d0dc38be323eb34fd070ef Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:09:01 -0700 Subject: [PATCH 13/89] fix(session): extend no-FTS5 degradation to the trigram CJK index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The salvaged contributor commit guarded only messages_fts. Current main also creates a second virtual table, messages_fts_trigram (CJK substring search), whose CREATE VIRTUAL TABLE ... USING fts5 still raised "no such module: fts5" on builds without FTS5 — re-crashing SessionDB init. Wrap the trigram setup with the same guard, and broaden the test's no-fts5 mock to fail BOTH tables so the regression test actually exercises a faithful no-FTS5 build. --- hermes_state.py | 14 ++++++++++++-- tests/test_hermes_state.py | 14 +++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index 7242e6b179c..71a89a2867b 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -744,8 +744,18 @@ class SessionDB: # Trigram FTS5 for CJK/substring search try: cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0") - except sqlite3.OperationalError: - cursor.executescript(FTS_TRIGRAM_SQL) + except sqlite3.OperationalError as exc: + if "no such table" not in str(exc).lower(): + raise + try: + cursor.executescript(FTS_TRIGRAM_SQL) + except sqlite3.OperationalError as fts_exc: + err = str(fts_exc).lower() + if "fts5" not in err and "no such module" not in err: + raise + # Same FTS5-unavailable cause already warned about above for + # messages_fts; the trigram table is an additional CJK index, + # so just degrade silently here. CJK search falls back to LIKE. self._conn.commit() diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index d14f065aec9..a6c33a5cbe8 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -11,12 +11,16 @@ class _NoFtsCursor(sqlite3.Cursor): """Simulate a SQLite build without the fts5 module.""" def execute(self, sql, parameters=()): - if sql.strip() == "SELECT * FROM messages_fts LIMIT 0": - raise sqlite3.OperationalError("no such table: messages_fts") + probe = sql.strip() + if probe in ( + "SELECT * FROM messages_fts LIMIT 0", + "SELECT * FROM messages_fts_trigram LIMIT 0", + ): + raise sqlite3.OperationalError("no such table: " + probe.split()[-3]) return super().execute(sql, parameters) def executescript(self, sql_script): - if "CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5" in sql_script: + if "USING fts5" in sql_script: raise sqlite3.OperationalError("no such module: fts5") return super().executescript(sql_script) @@ -167,6 +171,10 @@ class TestSessionLifecycle: db = SessionDB(db_path=tmp_path / "state.db") try: assert db._fts_enabled is False + # Neither FTS5 virtual table should have been created on a build + # that lacks the fts5 module — both init paths must degrade. + assert db._fts_table_exists("messages_fts") is False + assert db._fts_table_exists("messages_fts_trigram") is False db.create_session(session_id="s1", source="cli") db.append_message("s1", role="user", content="hello from sqlite without fts") From 4fa20f9a8bd9b2133cde56cf99516e38195ef4bd Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:39:49 -0700 Subject: [PATCH 14/89] fix(install): ensure the uv-managed Python ships SQLite FTS5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uv's python-build-standalone distributions only gained FTS5 in mid-2025 (#694). A stale interpreter already in uv's store — which `uv python find` reuses without checking — can lack it, leaving the supported install with a SQLite that can't create the FTS5 virtual tables hermes_state.py needs for full-text session search ("no such module: fts5"). check_python now probes the resolved interpreter for FTS5 and, if missing, reinstalls the latest patch for $PYTHON_VERSION (which has FTS5) and re-resolves. If an FTS5-capable Python still can't be obtained (offline, pinned env), it warns and continues — Hermes degrades gracefully and only disables session search. No bundled second SQLite, no user action. --- scripts/install.sh | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/scripts/install.sh b/scripts/install.sh index 7d1df04124e..bf96b93c6d0 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -473,6 +473,7 @@ check_python() { if PYTHON_PATH="$("$UV_CMD" python find "$PYTHON_VERSION" 2>/dev/null)"; then PYTHON_FOUND_VERSION="$("$PYTHON_PATH" --version 2>/dev/null)" log_success "Python found: $PYTHON_FOUND_VERSION" + ensure_fts5 return 0 fi @@ -482,6 +483,7 @@ check_python() { PYTHON_PATH="$("$UV_CMD" python find "$PYTHON_VERSION")" PYTHON_FOUND_VERSION="$("$PYTHON_PATH" --version 2>/dev/null)" log_success "Python installed: $PYTHON_FOUND_VERSION" + ensure_fts5 else log_error "Failed to install Python $PYTHON_VERSION" log_info "Install Python $PYTHON_VERSION manually, then re-run this script" @@ -489,6 +491,51 @@ check_python() { fi } +# Probe whether $1 (a python executable) links a SQLite with the FTS5 +# module compiled in. Hermes' session store (hermes_state.py) creates FTS5 +# virtual tables for full-text session search; a SQLite without FTS5 makes +# the bundled-python path unusable for that feature. Returns 0 if FTS5 works. +_python_has_fts5() { + "$1" - <<'PY' 2>/dev/null +import sqlite3, sys +try: + sqlite3.connect(":memory:").execute("CREATE VIRTUAL TABLE t USING fts5(x)") +except Exception: + sys.exit(1) +PY +} + +# Guarantee the resolved uv-managed interpreter ships FTS5. uv's Python +# distributions only gained FTS5 in mid-2025 (python-build-standalone #694), +# so a stale interpreter already in uv's store — which `uv python find` +# happily reuses — can lack it. When that happens, force a reinstall of the +# latest patch for $PYTHON_VERSION (which has FTS5) and re-resolve. This keeps +# the supported install path's session search working without bundling a +# second SQLite or asking the user to do anything. +ensure_fts5() { + [ -n "${PYTHON_PATH:-}" ] || return 0 + if _python_has_fts5 "$PYTHON_PATH"; then + return 0 + fi + + log_warn "Resolved Python's SQLite lacks the FTS5 module (session search needs it)." + log_info "Reinstalling a current Python $PYTHON_VERSION with FTS5 via uv..." + if "$UV_CMD" python install "$PYTHON_VERSION" --reinstall >/dev/null 2>&1; then + PYTHON_PATH="$("$UV_CMD" python find "$PYTHON_VERSION" 2>/dev/null)" + PYTHON_FOUND_VERSION="$("$PYTHON_PATH" --version 2>/dev/null)" + fi + + if [ -n "${PYTHON_PATH:-}" ] && _python_has_fts5 "$PYTHON_PATH"; then + log_success "FTS5 available ($PYTHON_FOUND_VERSION)" + else + # Could not obtain an FTS5-capable interpreter (offline, pinned env, + # etc.). Install proceeds — Hermes degrades gracefully and disables + # only full-text session search — but warn so it isn't a silent gap. + log_warn "Could not obtain an FTS5-capable Python. Hermes will run, but" + log_warn "full-text session search will be disabled until FTS5 is present." + fi +} + check_git() { log_info "Checking Git..." From a7421dc7d2f0659a016092db6fc154526c8734b3 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:52:09 -0700 Subject: [PATCH 15/89] fix(session): point no-FTS5 warning at the supported install When FTS5 is missing the warning now explains the likely cause (an unsupported / pip-managed Python whose bundled SQLite lacks FTS5) and links the supported install at hermes-agent.nousresearch.com, instead of just logging the raw error. --- hermes_state.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hermes_state.py b/hermes_state.py index 71a89a2867b..19f20763244 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -736,7 +736,13 @@ class SessionDB: if "fts5" not in err and "no such module" not in err: raise logger.warning( - "SQLite FTS5 unavailable for %s; full-text search disabled: %s", + "SQLite FTS5 unavailable for %s; full-text session search " + "disabled. This usually means Hermes is running on an " + "unsupported install (e.g. a pip-installed or pip-managed " + "Python whose bundled SQLite lacks FTS5) rather than a " + "mainline install. Some features may be missing or behave " + "differently. Install the supported way: " + "https://hermes-agent.nousresearch.com (underlying error: %s)", self.db_path, fts_exc, ) From aa32edcac5ee3c3359f2bf8ba2aa372f40787975 Mon Sep 17 00:00:00 2001 From: Siddharth Balyan <52913345+alt-glitch@users.noreply.github.com> Date: Sat, 30 May 2026 09:15:12 +0530 Subject: [PATCH 16/89] fix(setup): write config for image_gen and video_gen in apply_nous_managed_defaults (#35109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apply_nous_managed_defaults() was adding image_gen and video_gen to the 'changed' return set without writing any config values. The caller (tools_command first_install flow) uses 'changed' to skip manual configuration, so these tools ended up in platform_toolsets but with no video_gen.provider, video_gen.use_gateway, or image_gen.use_gateway in config.yaml. At runtime the FAL plugin's is_available() returned False because there was no FAL_KEY and no use_gateway config — the tool never loaded despite being 'enabled' in the toolset list. For image_gen this was a latent bug masked by the gateway offer prompt (prompt_enable_tool_gateway) running earlier in the setup flow and writing image_gen.use_gateway=True via apply_gateway_defaults(). But if the user skipped the gateway offer, image_gen would silently break the same way. For video_gen (added in PR #33259) the bug was always hit because the gateway offer ran before the user checked video_gen in the toolset checklist. Fix: write provider/use_gateway config values before adding to 'changed', matching the pattern used by web, tts, and browser. --- hermes_cli/nous_subscription.py | 11 +++ tests/hermes_cli/test_nous_subscription.py | 90 ++++++++++++++++++++++ tests/hermes_cli/test_tools_config.py | 60 +++++++++++++++ tests/honcho_plugin/test_async_memory.py | 5 +- 4 files changed, 165 insertions(+), 1 deletion(-) diff --git a/hermes_cli/nous_subscription.py b/hermes_cli/nous_subscription.py index 5f29101eb01..f19393337bd 100644 --- a/hermes_cli/nous_subscription.py +++ b/hermes_cli/nous_subscription.py @@ -587,9 +587,20 @@ def apply_nous_managed_defaults( changed.add("browser") if "image_gen" in selected_toolsets and not fal_key_is_configured(): + image_cfg = config.get("image_gen") + if not isinstance(image_cfg, dict): + image_cfg = {} + config["image_gen"] = image_cfg + image_cfg["use_gateway"] = True changed.add("image_gen") if "video_gen" in selected_toolsets and not fal_key_is_configured(): + video_cfg = config.get("video_gen") + if not isinstance(video_cfg, dict): + video_cfg = {} + config["video_gen"] = video_cfg + video_cfg["provider"] = "fal" + video_cfg["use_gateway"] = True changed.add("video_gen") return changed diff --git a/tests/hermes_cli/test_nous_subscription.py b/tests/hermes_cli/test_nous_subscription.py index 2c89d245301..561602c0ac6 100644 --- a/tests/hermes_cli/test_nous_subscription.py +++ b/tests/hermes_cli/test_nous_subscription.py @@ -231,3 +231,93 @@ def test_get_gateway_eligible_tools_ignores_quoted_false_opt_in(monkeypatch): assert "web" in has_direct assert "web" not in already_managed assert set(unconfigured) == {"image_gen", "video_gen", "tts", "browser"} + + +def test_apply_nous_managed_defaults_writes_video_gen_config(monkeypatch): + """apply_nous_managed_defaults must write video_gen.provider and + video_gen.use_gateway when a Nous subscriber selects video_gen + without a direct FAL_KEY.""" + monkeypatch.setattr(ns, "managed_nous_tools_enabled", lambda **kw: True) + monkeypatch.delenv("FAL_KEY", raising=False) + monkeypatch.setattr(ns, "fal_key_is_configured", lambda: False) + monkeypatch.setattr( + ns, "get_nous_portal_account_info", + lambda **kw: _account(logged_in=True, paid=True), + ) + + config = {"model": {"provider": "nous"}} + changed = ns.apply_nous_managed_defaults( + config, enabled_toolsets=["video_gen"], + ) + + assert "video_gen" in changed + assert config["video_gen"]["provider"] == "fal" + assert config["video_gen"]["use_gateway"] is True + + +def test_apply_nous_managed_defaults_writes_image_gen_config(monkeypatch): + """apply_nous_managed_defaults must write image_gen.use_gateway + when a Nous subscriber selects image_gen without a direct FAL_KEY.""" + monkeypatch.setattr(ns, "managed_nous_tools_enabled", lambda **kw: True) + monkeypatch.delenv("FAL_KEY", raising=False) + monkeypatch.setattr(ns, "fal_key_is_configured", lambda: False) + monkeypatch.setattr( + ns, "get_nous_portal_account_info", + lambda **kw: _account(logged_in=True, paid=True), + ) + + config = {"model": {"provider": "nous"}} + changed = ns.apply_nous_managed_defaults( + config, enabled_toolsets=["image_gen"], + ) + + assert "image_gen" in changed + assert config["image_gen"]["use_gateway"] is True + + +def test_apply_nous_managed_defaults_skips_fal_tools_when_key_present(monkeypatch): + """When FAL_KEY is set, apply_nous_managed_defaults should not touch + image_gen or video_gen config — the user's direct key takes precedence.""" + monkeypatch.setattr(ns, "managed_nous_tools_enabled", lambda **kw: True) + monkeypatch.setenv("FAL_KEY", "fal-direct-key") + monkeypatch.setattr(ns, "fal_key_is_configured", lambda: True) + monkeypatch.setattr( + ns, "get_nous_portal_account_info", + lambda **kw: _account(logged_in=True, paid=True), + ) + + config = {"model": {"provider": "nous"}} + changed = ns.apply_nous_managed_defaults( + config, enabled_toolsets=["image_gen", "video_gen"], + ) + + assert "image_gen" not in changed + assert "video_gen" not in changed + assert "image_gen" not in config + assert "video_gen" not in config + + +def test_apply_nous_managed_defaults_preserves_existing_video_gen_section(monkeypatch): + """When video_gen config already exists as a dict, the function should + update it in-place rather than replacing it.""" + monkeypatch.setattr(ns, "managed_nous_tools_enabled", lambda **kw: True) + monkeypatch.delenv("FAL_KEY", raising=False) + monkeypatch.setattr(ns, "fal_key_is_configured", lambda: False) + monkeypatch.setattr( + ns, "get_nous_portal_account_info", + lambda **kw: _account(logged_in=True, paid=True), + ) + + config = { + "model": {"provider": "nous"}, + "video_gen": {"model": "pixverse-v6"}, + } + changed = ns.apply_nous_managed_defaults( + config, enabled_toolsets=["video_gen"], + ) + + assert "video_gen" in changed + assert config["video_gen"]["provider"] == "fal" + assert config["video_gen"]["use_gateway"] is True + # Pre-existing keys should be preserved + assert config["video_gen"]["model"] == "pixverse-v6" diff --git a/tests/hermes_cli/test_tools_config.py b/tests/hermes_cli/test_tools_config.py index cfef9c3b46a..e93ad8fcaf3 100644 --- a/tests/hermes_cli/test_tools_config.py +++ b/tests/hermes_cli/test_tools_config.py @@ -757,8 +757,68 @@ def test_first_install_nous_auto_configures_managed_defaults(monkeypatch): assert config["web"]["backend"] == "firecrawl" assert config["tts"]["provider"] == "openai" assert config["browser"]["cloud_provider"] == "browser-use" + assert config["image_gen"]["use_gateway"] is True assert configured == [] + +def test_first_install_nous_auto_configures_video_gen(monkeypatch): + """When a Nous subscriber checks video_gen in the toolset checklist, + apply_nous_managed_defaults must write video_gen.provider and + video_gen.use_gateway so the FAL plugin can route through the gateway + at runtime. Regression test for the bug where video_gen was marked as + auto-configured but no config was actually written.""" + monkeypatch.setattr("hermes_cli.nous_subscription.managed_nous_tools_enabled", lambda: True) + config = { + "model": {"provider": "nous"}, + "platform_toolsets": {"cli": []}, + } + for env_var in ( + "VOICE_TOOLS_OPENAI_KEY", + "OPENAI_API_KEY", + "ELEVENLABS_API_KEY", + "FIRECRAWL_API_KEY", + "FIRECRAWL_API_URL", + "TAVILY_API_KEY", + "PARALLEL_API_KEY", + "BROWSERBASE_API_KEY", + "BROWSERBASE_PROJECT_ID", + "BROWSER_USE_API_KEY", + "FAL_KEY", + ): + monkeypatch.delenv(env_var, raising=False) + + monkeypatch.setattr( + "hermes_cli.tools_config._prompt_toolset_checklist", + lambda *args, **kwargs: {"video_gen"}, + ) + monkeypatch.setattr("hermes_cli.tools_config.save_config", lambda config: None) + monkeypatch.setattr( + "hermes_cli.tools_config._get_enabled_platforms", + lambda: ["cli"], + ) + monkeypatch.setattr( + "hermes_cli.nous_subscription.get_nous_portal_account_info", + lambda *args, **kwargs: NousPortalAccountInfo( + logged_in=True, + source="jwt", + fresh=False, + paid_service_access=True, + ), + ) + + configured = [] + monkeypatch.setattr( + "hermes_cli.tools_config._configure_toolset", + lambda ts_key, config: configured.append(ts_key), + ) + + tools_command(first_install=True, config=config) + + assert config["video_gen"]["provider"] == "fal" + assert config["video_gen"]["use_gateway"] is True + # video_gen should NOT appear in the manual configure list — it's auto-configured + assert "video_gen" not in configured + # ── Platform / toolset consistency ──────────────────────────────────────────── diff --git a/tests/honcho_plugin/test_async_memory.py b/tests/honcho_plugin/test_async_memory.py index 97f4f7306d5..e1f2f5ea97b 100644 --- a/tests/honcho_plugin/test_async_memory.py +++ b/tests/honcho_plugin/test_async_memory.py @@ -249,9 +249,12 @@ class TestFlushAll: mgr = _make_manager(write_frequency="async") sess = _make_session() sess.add_message("user", "pending") - mgr._async_queue.put(sess) with patch.object(mgr, "_flush_session") as mock_flush: + # Put the item AFTER the mock is installed so the background + # writer thread (if it dequeues before flush_all) still hits + # the mock rather than the real _flush_session. + mgr._async_queue.put(sess) mgr.flush_all() # Called at least once for the queued item assert mock_flush.call_count >= 1 From 827ce602dbed199f665f3975b61303aace2963ea Mon Sep 17 00:00:00 2001 From: Erosika Date: Sat, 30 May 2026 10:54:53 +0530 Subject: [PATCH 17/89] fix(honcho): harden self-hosted setup paths Self-hosted Honcho setup had four sharp edges: - local/cloud URLs ending in /vN double-prefixed by the SDK (/v3/v3/... 404) - authenticated local servers had no setup prompt for a JWT/bearer token - profile-derived host keys could be dot-containing workspace IDs Honcho rejects - memory-provider config files with API keys written world-readable per umask This keeps existing behavior but makes those paths safer: - strip a trailing /vN version segment from any configured baseUrl before SDK init (the SDK's route builders always prepend their own version prefix); auth-skipping stays loopback-only - add an optional local JWT/bearer prompt in honcho setup, stored under hosts..apiKey - derive new profile host keys with underscores, still reading legacy hermes. blocks - write memory-provider config files atomically with 0600 via a shared utils.atomic_json_write(mode=) arg (honcho/hindsight/mem0/supermemory) - skip honcho.json parsing in gateway cache-busting unless Honcho is the active memory provider; memoize by honcho.json mtime when active - bust the gateway agent cache on memory.provider change - add a hermes memory setup one-liner so fresh installs can configure a named provider without the picker (the per-provider hermes subcommand only registers once that provider is active) Closes #20688, #29885, #26459, #30246, #33382, #32244. Co-authored-by: BROCCOLO1D --- gateway/run.py | 70 ++++-- hermes_cli/main.py | 8 +- hermes_cli/memory_setup.py | 6 +- hermes_cli/profiles.py | 21 +- .../autonomous-ai-agents/honcho/SKILL.md | 4 +- plugins/memory/hindsight/__init__.py | 3 +- plugins/memory/honcho/README.md | 21 +- plugins/memory/honcho/__init__.py | 4 +- plugins/memory/honcho/cli.py | 71 ++++-- plugins/memory/honcho/client.py | 40 +++- plugins/memory/mem0/__init__.py | 3 +- plugins/memory/supermemory/__init__.py | 3 +- tests/gateway/test_agent_cache.py | 105 +++++++++ .../test_memory_setup_provider_arg.py | 50 ++++ tests/hermes_cli/test_profiles.py | 12 +- tests/honcho_plugin/test_cli.py | 89 ++++++- tests/honcho_plugin/test_client.py | 218 +++++++++++++++++- tests/honcho_plugin/test_pin_peer_name.py | 16 +- .../plugins/memory/test_hindsight_provider.py | 12 + tests/plugins/memory/test_mem0_v2.py | 15 ++ .../memory/test_supermemory_provider.py | 12 + tests/test_honcho_client_config.py | 25 ++ utils.py | 16 +- website/docs/user-guide/features/honcho.md | 9 +- .../user-guide/features/memory-providers.md | 2 +- 25 files changed, 734 insertions(+), 101 deletions(-) create mode 100644 tests/hermes_cli/test_memory_setup_provider_arg.py diff --git a/gateway/run.py b/gateway/run.py index 5cdc5894cf4..1b2220a561c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15331,8 +15331,52 @@ class GatewayRunner: ("compression", "target_ratio"), ("compression", "protect_last_n"), ("agent", "disabled_toolsets"), + ("memory", "provider"), ) + _HONCHO_CACHE_BUSTING_KEYS = ( + "honcho.peer_name", + "honcho.ai_peer", + "honcho.pin_peer_name", + "honcho.runtime_peer_prefix", + "honcho.user_peer_aliases", + ) + _HONCHO_CACHE_BUSTING_MEMO: dict[tuple[str, int | None], dict[str, Any]] = {} + + @classmethod + def _empty_honcho_cache_busting_config(cls) -> dict[str, Any]: + return {key: None for key in cls._HONCHO_CACHE_BUSTING_KEYS} + + @classmethod + def _extract_honcho_cache_busting_config(cls) -> dict[str, Any]: + """Extract Honcho identity keys, memoized by honcho.json mtime.""" + try: + from plugins.memory.honcho.client import HonchoClientConfig, resolve_config_path + + path = resolve_config_path() + try: + mtime_ns = path.stat().st_mtime_ns + except OSError: + mtime_ns = None + memo_key = (str(path), mtime_ns) + cached = cls._HONCHO_CACHE_BUSTING_MEMO.get(memo_key) + if cached is not None: + return dict(cached) + + hcfg = HonchoClientConfig.from_global_config(config_path=path) + aliases = hcfg.user_peer_aliases or {} + values = { + "honcho.peer_name": hcfg.peer_name, + "honcho.ai_peer": hcfg.ai_peer, + "honcho.pin_peer_name": bool(hcfg.pin_peer_name), + "honcho.runtime_peer_prefix": hcfg.runtime_peer_prefix or "", + "honcho.user_peer_aliases": sorted(aliases.items()) if isinstance(aliases, dict) else [], + } + cls._HONCHO_CACHE_BUSTING_MEMO = {memo_key: values} + return dict(values) + except Exception: + return cls._empty_honcho_cache_busting_config() + @classmethod def _extract_cache_busting_config(cls, user_config: dict | None) -> dict: """Pull values that must bust the cached agent. @@ -15363,26 +15407,12 @@ class GatewayRunner: out["tools.registry_generation"] = None # Honcho identity-mapping keys live in honcho.json, not user_config. - # HonchoSessionManager freezes the resolved peer_name / ai_peer / - # pin / aliases / prefix at construction; without busting here, - # mid-flight honcho.json edits go unread until the next unrelated - # cache eviction. - try: - from plugins.memory.honcho.client import HonchoClientConfig - - hcfg = HonchoClientConfig.from_global_config() - out["honcho.peer_name"] = hcfg.peer_name - out["honcho.ai_peer"] = hcfg.ai_peer - out["honcho.pin_peer_name"] = bool(hcfg.pin_peer_name) - out["honcho.runtime_peer_prefix"] = hcfg.runtime_peer_prefix or "" - aliases = hcfg.user_peer_aliases or {} - out["honcho.user_peer_aliases"] = sorted(aliases.items()) if isinstance(aliases, dict) else [] - except Exception: - out["honcho.peer_name"] = None - out["honcho.ai_peer"] = None - out["honcho.pin_peer_name"] = None - out["honcho.runtime_peer_prefix"] = None - out["honcho.user_peer_aliases"] = None + # Only read that file when Honcho is the active memory provider. + provider = cfg_get(cfg, "memory", "provider") + if isinstance(provider, str) and provider.lower() == "honcho": + out.update(cls._extract_honcho_cache_busting_config()) + else: + out.update(cls._empty_honcho_cache_busting_config()) return out diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 76bd12a53e1..79dd50c23b0 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -13029,9 +13029,15 @@ Examples: ), ) memory_sub = memory_parser.add_subparsers(dest="memory_command") - memory_sub.add_parser( + _setup_parser = memory_sub.add_parser( "setup", help="Interactive provider selection and configuration" ) + _setup_parser.add_argument( + "provider", + nargs="?", + default=None, + help="Provider to configure directly (e.g. honcho), skipping the picker", + ) memory_sub.add_parser("status", help="Show current memory provider config") memory_sub.add_parser("off", help="Disable external provider (built-in only)") _reset_parser = memory_sub.add_parser( diff --git a/hermes_cli/memory_setup.py b/hermes_cli/memory_setup.py index cac13bf781d..a75c10b0229 100644 --- a/hermes_cli/memory_setup.py +++ b/hermes_cli/memory_setup.py @@ -452,7 +452,11 @@ def memory_command(args) -> None: """Route memory subcommands.""" sub = getattr(args, "memory_command", None) if sub == "setup": - cmd_setup(args) + provider = getattr(args, "provider", None) + if provider: + cmd_setup_provider(provider) + else: + cmd_setup(args) elif sub == "status": cmd_status(args) else: diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index f490cbbfb99..31dbf8dfb4a 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -1471,8 +1471,9 @@ def import_profile(archive_path: str, name: Optional[str] = None) -> Path: def _migrate_honcho_profile_host(old_name: str, new_name: str, new_dir: Path) -> None: """Rename Honcho host blocks for a renamed profile without changing peers.""" - old_host = f"hermes.{old_name}" - new_host = f"hermes.{new_name}" + old_host = f"hermes_{old_name}" + legacy_old_host = f"hermes.{old_name}" + new_host = f"hermes_{new_name}" candidates = [ new_dir / "honcho.json", @@ -1496,18 +1497,24 @@ def _migrate_honcho_profile_host(old_name: str, new_name: str, new_dir: Path) -> continue hosts = raw.get("hosts") - if not isinstance(hosts, dict) or old_host not in hosts: + if not isinstance(hosts, dict): + continue + source_host = old_host if old_host in hosts else legacy_old_host + if source_host not in hosts: continue if new_host in hosts: print(f"⚠ Honcho host block not migrated: {new_host} already exists in {path}") continue - block = hosts[old_host] + block = hosts[source_host] if isinstance(block, dict) and "aiPeer" not in block: - bare = old_host.split(".", 1)[1] if "." in old_host else old_host + if source_host.startswith("hermes_"): + bare = source_host.split("_", 1)[1] + else: + bare = source_host.split(".", 1)[1] if "." in source_host else source_host block["aiPeer"] = bare - hosts[new_host] = hosts.pop(old_host) + hosts[new_host] = hosts.pop(source_host) tmp = path.with_suffix(path.suffix + ".tmp") try: tmp.write_text(json.dumps(raw, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") @@ -1519,7 +1526,7 @@ def _migrate_honcho_profile_host(old_name: str, new_name: str, new_dir: Path) -> pass continue - print(f"✓ Honcho host updated: {old_host} → {new_host}") + print(f"✓ Honcho host updated: {source_host} → {new_host}") def rename_profile(old_name: str, new_name: str) -> Path: diff --git a/optional-skills/autonomous-ai-agents/honcho/SKILL.md b/optional-skills/autonomous-ai-agents/honcho/SKILL.md index 865d844df26..b4a24a46e25 100644 --- a/optional-skills/autonomous-ai-agents/honcho/SKILL.md +++ b/optional-skills/autonomous-ai-agents/honcho/SKILL.md @@ -32,14 +32,14 @@ Honcho provides AI-native cross-session user modeling. It learns who the user is ### Cloud (app.honcho.dev) ```bash -hermes honcho setup +hermes memory setup honcho # select "cloud", paste API key from https://app.honcho.dev ``` ### Self-hosted ```bash -hermes honcho setup +hermes memory setup honcho # select "local", enter base URL (e.g. http://localhost:8000) ``` diff --git a/plugins/memory/hindsight/__init__.py b/plugins/memory/hindsight/__init__.py index ef8fcafb88a..2f94c08da38 100644 --- a/plugins/memory/hindsight/__init__.py +++ b/plugins/memory/hindsight/__init__.py @@ -633,7 +633,8 @@ class HindsightMemoryProvider(MemoryProvider): except Exception: pass existing.update(values) - config_path.write_text(json.dumps(existing, indent=2)) + from utils import atomic_json_write + atomic_json_write(config_path, existing, mode=0o600) def post_setup(self, hermes_home: str, config: dict) -> None: """Custom setup wizard — installs only the deps needed for the selected mode.""" diff --git a/plugins/memory/honcho/README.md b/plugins/memory/honcho/README.md index dbe3eebc9a5..3774747d05a 100644 --- a/plugins/memory/honcho/README.md +++ b/plugins/memory/honcho/README.md @@ -12,8 +12,8 @@ AI-native cross-session user modeling with multi-pass dialectic reasoning, sessi ## Setup ```bash -hermes honcho setup # full interactive wizard (cloud or local) -hermes memory setup # generic picker, also works +hermes memory setup honcho # configure Honcho directly (works on a fresh install) +hermes memory setup # generic picker, choose Honcho from the list ``` Or manually: @@ -22,6 +22,10 @@ hermes config set memory.provider honcho echo "HONCHO_API_KEY=***" >> ~/.hermes/.env ``` +> `hermes honcho setup` also works, but only **after** Honcho is the active +> memory provider — the `honcho` subcommand is registered for the active +> provider only. On a fresh install, use `hermes memory setup honcho`. + ## Architecture Overview ### Two-Layer Context Injection @@ -109,7 +113,7 @@ Config is read from the first file that exists: | 2 | `~/.hermes/honcho.json` | Default profile (shared host blocks) | | 3 | `~/.honcho/config.json` | Global (cross-app interop) | -Host key is derived from the active Hermes profile: `hermes` (default) or `hermes.`. +Host key is derived from the active Hermes profile: `hermes` (default) or `hermes_`. For every key, resolution order is: **host block > root > env var > default**. @@ -154,7 +158,7 @@ In gateway deployments (Telegram, Discord, Slack, etc.) each user arrives with a **Host vs root semantics.** All three keys are accepted at both root and `hosts.` levels. Host-level wins. For maps and prefixes, host-level *replaces* the root value as a whole (not merge), so a host can intentionally own its identity universe or wipe it with `userPeerAliases: {}` / `runtimePeerPrefix: ""`. -**Deployment shapes** (`hermes honcho setup` asks one prompt to set these): +**Deployment shapes** (`hermes memory setup honcho` asks one prompt to set these): - **Single-operator** — `pinUserPeer: true`. All gateway users → `peerName`. Recommended for personal use where you connect Hermes to your own Telegram/Discord/etc. - **Multi-user gateway** — `pinUserPeer: false`, optional `runtimePeerPrefix`. Each runtime user → own peer. Recommended for bots serving many humans. @@ -225,7 +229,7 @@ Multiple Hermes profiles can share one workspace while maintaining separate AI i "recallMode": "hybrid", "sessionStrategy": "per-directory" }, - "hermes.coder": { + "hermes_coder": { "aiPeer": "coder", "recallMode": "tools", "sessionStrategy": "per-repo" @@ -236,7 +240,7 @@ Multiple Hermes profiles can share one workspace while maintaining separate AI i Both profiles see the same user (`yourname`) in the same shared environment (`hermes`), but each AI peer builds its own observations, conclusions, and behavior patterns. The coder's memory stays code-oriented; the main agent's stays broad. -Host key is derived from the active Hermes profile: `hermes` (default) or `hermes.` (e.g. `hermes -p coder` → host key `hermes.coder`). +Host key is derived from the active Hermes profile: `hermes` (default) or `hermes_` (e.g. `hermes -p coder` -> host key `hermes_coder`). Older `hermes.` host blocks are still read for compatibility and are migrated when the CLI writes profile-scoped Honcho config. ### Dialectic & Reasoning @@ -307,7 +311,8 @@ Presets: | Command | Description | |---------|-------------| -| `hermes honcho setup` | Full interactive setup wizard | +| `hermes memory setup honcho` | Configure Honcho directly — works on a fresh install | +| `hermes honcho setup` | Interactive setup wizard (only registered once Honcho is the active provider; redirects to `hermes memory setup`) | | `hermes honcho status` | Show resolved config for active profile | | `hermes honcho enable` / `disable` | Toggle Honcho for active profile | | `hermes honcho mode ` | Change recall or observation mode | @@ -344,7 +349,7 @@ Presets: "dialecticMaxChars": 600, "saveMessages": true }, - "hermes.coder": { + "hermes_coder": { "enabled": true, "aiPeer": "coder", "sessionStrategy": "per-repo", diff --git a/plugins/memory/honcho/__init__.py b/plugins/memory/honcho/__init__.py index bbff0d0e628..6e6f39b8cd7 100644 --- a/plugins/memory/honcho/__init__.py +++ b/plugins/memory/honcho/__init__.py @@ -249,6 +249,7 @@ class HonchoMemoryProvider(MemoryProvider): def save_config(self, values, hermes_home): """Write config to $HERMES_HOME/honcho.json (Honcho SDK native format).""" import json + import os from pathlib import Path config_path = Path(hermes_home) / "honcho.json" existing = {} @@ -258,7 +259,8 @@ class HonchoMemoryProvider(MemoryProvider): except Exception: pass existing.update(values) - config_path.write_text(json.dumps(existing, indent=2)) + from utils import atomic_json_write + atomic_json_write(config_path, existing, mode=0o600) def get_config_schema(self): return [ diff --git a/plugins/memory/honcho/cli.py b/plugins/memory/honcho/cli.py index 9227bf95ab8..ce2af8a08b2 100644 --- a/plugins/memory/honcho/cli.py +++ b/plugins/memory/honcho/cli.py @@ -11,7 +11,7 @@ import sys from pathlib import Path from hermes_constants import get_hermes_home -from plugins.memory.honcho.client import resolve_active_host, resolve_config_path, HOST +from plugins.memory.honcho.client import _host_block, profile_host_key, resolve_active_host, resolve_config_path, HOST from hermes_cli.config import cfg_get @@ -36,7 +36,7 @@ def clone_honcho_for_profile(profile_name: str) -> bool: if not default_block and not has_key: return False - new_host = f"{HOST}.{profile_name}" + new_host = profile_host_key(profile_name) if new_host in hosts: return False # already exists @@ -192,7 +192,7 @@ def cmd_sync(args) -> None: if p.name == "default": continue if clone_honcho_for_profile(p.name): - print(f" + {p.name} -> hermes.{p.name}") + print(f" + {p.name} -> {profile_host_key(p.name)}") created += 1 else: skipped += 1 @@ -243,7 +243,7 @@ def _host_key() -> str: if _profile_override: if _profile_override in {"default", "custom"}: return HOST - return f"{HOST}.{_profile_override}" + return profile_host_key(_profile_override) return resolve_active_host() @@ -275,10 +275,8 @@ def _read_config() -> dict: def _write_config(cfg: dict, path: Path | None = None) -> None: path = path or _local_config_path() path.parent.mkdir(parents=True, exist_ok=True) - path.write_text( - json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) + from utils import atomic_json_write + atomic_json_write(path, cfg, mode=0o600) def _resolve_api_key(cfg: dict) -> str: @@ -292,7 +290,7 @@ def _resolve_api_key(cfg: dict) -> str: config shapes, e.g. ``localhost:8000``) still pass — the Honcho SDK will reject them itself with a clearer error than ours. """ - host_key = ((cfg.get("hosts") or {}).get(_host_key()) or {}).get("apiKey") + host_key = _host_block(cfg, _host_key()).get("apiKey") key = host_key or cfg.get("apiKey", "") or os.environ.get("HONCHO_API_KEY", "") if not key: base_url = cfg.get("baseUrl") or cfg.get("base_url") or os.environ.get("HONCHO_BASE_URL", "") @@ -462,21 +460,58 @@ def cmd_setup(args) -> None: cfg.pop("base_url", None) if is_local: - # --- Local: ask for base URL, skip or clear API key --- + # --- Local: ask for base URL, optionally accept a JWT for auth --- current_url = cfg.get("baseUrl") or "" new_url = _prompt("Base URL", default=current_url or "http://localhost:8000") if new_url: cfg["baseUrl"] = new_url - # For local no-auth, the SDK must not send an API key. - # We keep the key in config (for cloud switching later) but - # the client should skip auth when baseUrl is local. - current_key = cfg.get("apiKey", "") - if current_key: - print(f"\n API key present in config (kept for cloud/hybrid use).") - print(" Local connections will skip auth automatically.") + # Self-hosted Honcho can run with AUTH_USE_AUTH=true and an + # AUTH_JWT_SECRET on the server side. In that case clients must + # send a JWT signed with that secret as the bearer token (the + # Honcho SDK takes it via ``api_key=``). Cloud users got prompted + # for a key already; the local path historically skipped this and + # forced users to disable auth on the server. Offer the prompt + # here too. We store it under the host block (not the top-level + # apiKey) so ``get_honcho_client`` recognises it as an explicit + # local auth opt-in (see ``_host_has_key`` in client.py) and + # cloud/hybrid switching is unaffected. + current_host_key = hermes_host.get("apiKey", "") + masked = ( + f"...{current_host_key[-8:]}" + if len(current_host_key) > 8 + else ("set" if current_host_key else "not set") + ) + print( + "\n Local Honcho auth (JWT signed with the server's " + "AUTH_JWT_SECRET)." + ) + print( + " Leave blank if your server runs with AUTH_USE_AUTH=false. " + f"Current: {masked}" + ) + new_local_key = _prompt( + "Local JWT / bearer token (blank to skip / keep current)", + secret=True, + ) + if new_local_key: + hermes_host["apiKey"] = new_local_key + elif current_host_key: + print(" Keeping existing local JWT.") else: - print("\n No API key set. Local no-auth ready.") + # Surface the top-level key situation for transparency. + top_key = cfg.get("apiKey", "") + if top_key: + print( + "\n Top-level API key present in config (kept for " + "cloud/hybrid use)." + ) + print( + " Local connections will skip auth automatically " + "until a local JWT is set above." + ) + else: + print("\n No local JWT set. Local no-auth ready.") else: # --- Cloud: set default base URL, require API key --- cfg.pop("baseUrl", None) # cloud uses SDK default diff --git a/plugins/memory/honcho/client.py b/plugins/memory/honcho/client.py index 3d31bd7a1fb..ae837a0b115 100644 --- a/plugins/memory/honcho/client.py +++ b/plugins/memory/honcho/client.py @@ -32,6 +32,24 @@ logger = logging.getLogger(__name__) HOST = "hermes" +def profile_host_key(profile: str | None) -> str: + """Return the safe Honcho host key for a Hermes profile.""" + if not profile or profile in {"default", "custom"}: + return HOST + sanitized = "".join(c if c.isalnum() or c in "_-" else "_" for c in profile).strip("_") + return f"{HOST}_{sanitized or 'profile'}" + + +def _host_block(raw: dict, host: str) -> dict: + """Return host config, accepting legacy dot-form profile host keys.""" + hosts = raw.get("hosts") or {} + block = hosts.get(host, {}) + if block or not host.startswith(f"{HOST}_"): + return block + legacy = f"{HOST}.{host[len(HOST) + 1:]}" + return hosts.get(legacy, {}) + + def resolve_active_host() -> str: """Derive the Honcho host key from the active Hermes profile. @@ -47,8 +65,7 @@ def resolve_active_host() -> str: try: from hermes_cli.profiles import get_active_profile_name profile = get_active_profile_name() - if profile and profile not in {"default", "custom"}: - return f"{HOST}.{profile}" + return profile_host_key(profile) except Exception: pass return HOST @@ -406,7 +423,7 @@ class HonchoClientConfig: logger.warning("Failed to read %s: %s, falling back to env", path, e) return cls.from_env(host=resolved_host) - host_block = (raw.get("hosts") or {}).get(resolved_host, {}) + host_block = _host_block(raw, resolved_host) # A hosts.hermes block or explicit enabled flag means the user # intentionally configured Honcho for this host. _explicitly_configured = bool(host_block) or raw.get("enabled") is True @@ -811,7 +828,10 @@ def get_honcho_client(config: HonchoClientConfig | None = None) -> Honcho: or "::1" in resolved_base_url ) if _is_local: - # Check if the host block has its own apiKey (explicit local auth) + # Check if the host block has its own apiKey (explicit local auth). + # Auth-skipping is loopback-only: a stored key is likely a cloud key + # that would break a no-auth local server, so we substitute the SDK's + # required-non-empty placeholder unless the host block opts in. _raw = config.raw or {} _host_block = (_raw.get("hosts") or {}).get(config.host, {}) _host_has_key = bool(_host_block.get("apiKey")) @@ -819,6 +839,18 @@ def get_honcho_client(config: HonchoClientConfig | None = None) -> Honcho: else: effective_api_key = config.api_key + # The Honcho SDK's route builders (e.g. routes.workspaces()) already + # include the version prefix (e.g. "/v3/workspaces"). When a user-supplied + # base_url already ends in a version segment (e.g. + # "http://localhost:38000/v3", "https://honcho.my.ts.net/v3"), concatenating + # the two produces "/v3/v3/workspaces" → 404 on every call. This is a pure + # routing concern independent of host, so strip a trailing version segment + # from ANY base_url — loopback, LAN, custom domain, or cloud alike. The + # SDK then appends its own versioned paths correctly. + if resolved_base_url: + import re as _re + resolved_base_url = _re.sub(r"/v\d+/*$", "", resolved_base_url).rstrip("/") + kwargs: dict = { "workspace_id": config.workspace_id, "api_key": effective_api_key, diff --git a/plugins/memory/mem0/__init__.py b/plugins/memory/mem0/__init__.py index 32d1f6ff700..332b3ac9412 100644 --- a/plugins/memory/mem0/__init__.py +++ b/plugins/memory/mem0/__init__.py @@ -155,7 +155,8 @@ class Mem0MemoryProvider(MemoryProvider): except Exception: pass existing.update(values) - config_path.write_text(json.dumps(existing, indent=2)) + from utils import atomic_json_write + atomic_json_write(config_path, existing, mode=0o600) def get_config_schema(self): return [ diff --git a/plugins/memory/supermemory/__init__.py b/plugins/memory/supermemory/__init__.py index 35b5b6fd649..a21ae53cc06 100644 --- a/plugins/memory/supermemory/__init__.py +++ b/plugins/memory/supermemory/__init__.py @@ -152,7 +152,8 @@ def _save_supermemory_config(values: dict, hermes_home: str) -> None: except Exception: existing = {} existing.update(values) - config_path.write_text(json.dumps(existing, indent=2, sort_keys=True) + "\n", encoding="utf-8") + from utils import atomic_json_write + atomic_json_write(config_path, existing, mode=0o600, sort_keys=True) def _detect_category(text: str) -> str: diff --git a/tests/gateway/test_agent_cache.py b/tests/gateway/test_agent_cache.py index 0c6e2df3bd9..37f8b51a458 100644 --- a/tests/gateway/test_agent_cache.py +++ b/tests/gateway/test_agent_cache.py @@ -276,6 +276,111 @@ class TestExtractCacheBustingConfig: assert out["tools.registry_generation"] == 12345 + + def test_skips_honcho_config_read_when_provider_is_not_honcho(self, monkeypatch): + """Non-Honcho gateways must not read/parse honcho.json on every message.""" + from gateway.run import GatewayRunner + + called = False + + def _boom(): + nonlocal called + called = True + raise AssertionError("should not read Honcho config") + + monkeypatch.setattr(GatewayRunner, "_extract_honcho_cache_busting_config", _boom) + + out = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "mem0"}}) + + assert called is False + assert out["honcho.peer_name"] is None + assert out["honcho.user_peer_aliases"] is None + + def test_reads_honcho_config_only_when_provider_is_honcho(self, monkeypatch): + from gateway.run import GatewayRunner + + calls = [] + + def _fake(): + calls.append(True) + return { + "honcho.peer_name": "eri", + "honcho.ai_peer": "hermes", + "honcho.pin_peer_name": True, + "honcho.runtime_peer_prefix": "tg_", + "honcho.user_peer_aliases": [("123", "eri")], + } + + monkeypatch.setattr(GatewayRunner, "_extract_honcho_cache_busting_config", _fake) + + out = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) + + assert calls == [True] + assert out["honcho.peer_name"] == "eri" + assert out["honcho.user_peer_aliases"] == [("123", "eri")] + + def test_memory_provider_change_busts_signature(self, monkeypatch): + """Switching memory.provider must itself change the cache-busting + signature, so the agent is rebuilt when a user swaps providers + mid-gateway (independent of the honcho.json identity keys).""" + from gateway.run import GatewayRunner + + # Neutralize honcho.json reads so the only varying input is the + # provider value itself. + monkeypatch.setattr( + GatewayRunner, + "_extract_honcho_cache_busting_config", + classmethod(lambda cls: cls._empty_honcho_cache_busting_config()), + ) + + sig_honcho = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) + sig_mem0 = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "mem0"}}) + + assert sig_honcho["memory.provider"] == "honcho" + assert sig_mem0["memory.provider"] == "mem0" + assert sig_honcho != sig_mem0 + + def test_honcho_cache_busting_config_memoized_by_mtime(self, monkeypatch, tmp_path): + """Repeated Honcho extraction for unchanged honcho.json should reuse parse result.""" + from types import SimpleNamespace + from gateway.run import GatewayRunner + + config_path = tmp_path / "honcho.json" + config_path.write_text("{}") + parse_calls = [] + + class FakeConfig: + peer_name = "eri" + ai_peer = "hermes" + pin_peer_name = False + runtime_peer_prefix = "tg_" + user_peer_aliases = {"123": "eri"} + + @classmethod + def from_global_config(cls, config_path=None): + parse_calls.append(config_path) + return cls() + + fake_client = SimpleNamespace( + HonchoClientConfig=FakeConfig, + resolve_config_path=lambda: config_path, + ) + monkeypatch.setitem(__import__("sys").modules, "plugins.memory.honcho.client", fake_client) + monkeypatch.setattr(GatewayRunner, "_HONCHO_CACHE_BUSTING_MEMO", {}) + + first = GatewayRunner._extract_honcho_cache_busting_config() + second = GatewayRunner._extract_honcho_cache_busting_config() + + assert first == second + assert first["honcho.user_peer_aliases"] == [("123", "eri")] + assert parse_calls == [config_path] + + config_path.write_text("{\n \"changed\": true\n}") + third = GatewayRunner._extract_honcho_cache_busting_config() + + assert third == first + assert parse_calls == [config_path, config_path] + def test_full_round_trip_busts_cache_on_real_edit(self): """End-to-end: simulate a config edit on main and verify the extracted cache_keys change produces a new signature.""" diff --git a/tests/hermes_cli/test_memory_setup_provider_arg.py b/tests/hermes_cli/test_memory_setup_provider_arg.py new file mode 100644 index 00000000000..6dd310094b5 --- /dev/null +++ b/tests/hermes_cli/test_memory_setup_provider_arg.py @@ -0,0 +1,50 @@ +"""Tests for `hermes memory setup [provider]` routing. + +The `memory setup` subcommand accepts an optional positional ``provider`` so a +fresh install can configure a specific provider directly (e.g. +``hermes memory setup honcho``) without the interactive picker — which matters +because the per-provider ``hermes `` subcommand is only registered +once that provider is active. +""" + +from types import SimpleNamespace +from unittest.mock import patch + +from hermes_cli import memory_setup + + +class TestMemorySetupProviderRouting: + def test_setup_with_provider_arg_skips_picker(self): + """`memory setup honcho` routes straight to cmd_setup_provider.""" + args = SimpleNamespace(memory_command="setup", provider="honcho") + with patch.object(memory_setup, "cmd_setup_provider") as direct, \ + patch.object(memory_setup, "cmd_setup") as picker: + memory_setup.memory_command(args) + direct.assert_called_once_with("honcho") + picker.assert_not_called() + + def test_setup_without_provider_runs_picker(self): + """`memory setup` (no provider) runs the interactive picker.""" + args = SimpleNamespace(memory_command="setup", provider=None) + with patch.object(memory_setup, "cmd_setup_provider") as direct, \ + patch.object(memory_setup, "cmd_setup") as picker: + memory_setup.memory_command(args) + picker.assert_called_once_with(args) + direct.assert_not_called() + + def test_setup_with_missing_provider_attr_runs_picker(self): + """A SimpleNamespace lacking `provider` must not crash — fall back to picker.""" + args = SimpleNamespace(memory_command="setup") + with patch.object(memory_setup, "cmd_setup_provider") as direct, \ + patch.object(memory_setup, "cmd_setup") as picker: + memory_setup.memory_command(args) + picker.assert_called_once_with(args) + direct.assert_not_called() + + def test_unknown_provider_reports_and_returns_early(self, capsys): + """An unknown provider name surfaces a helpful message and returns + before any config load/save (the not-found guard precedes those imports).""" + memory_setup.cmd_setup_provider("notaprovider") + out = capsys.readouterr().out + assert "not found" in out + assert "hermes memory setup" in out diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py index 22e36d42123..dd336030928 100644 --- a/tests/hermes_cli/test_profiles.py +++ b/tests/hermes_cli/test_profiles.py @@ -754,8 +754,8 @@ class TestRenameProfile: cfg = json.loads(honcho_path.read_text()) assert "hermes.ssi_health" not in cfg["hosts"] - assert cfg["hosts"]["hermes.heimdall"]["aiPeer"] == "ssi_health" - assert cfg["hosts"]["hermes.heimdall"]["peerName"] == "user-peer" + assert cfg["hosts"]["hermes_heimdall"]["aiPeer"] == "ssi_health" + assert cfg["hosts"]["hermes_heimdall"]["peerName"] == "user-peer" def test_pins_ai_peer_when_absent_on_honcho_host_rename(self, profile_env): tmp_path = profile_env @@ -772,8 +772,8 @@ class TestRenameProfile: cfg = json.loads(honcho_path.read_text()) assert "hermes.ssi_health" not in cfg["hosts"] - assert cfg["hosts"]["hermes.heimdall"]["aiPeer"] == "ssi_health" - assert cfg["hosts"]["hermes.heimdall"]["workspace"] == "hermes" + assert cfg["hosts"]["hermes_heimdall"]["aiPeer"] == "ssi_health" + assert cfg["hosts"]["hermes_heimdall"]["workspace"] == "hermes" def test_does_not_overwrite_existing_honcho_host_on_rename(self, profile_env): tmp_path = profile_env @@ -782,7 +782,7 @@ class TestRenameProfile: honcho_path.write_text(json.dumps({ "hosts": { "hermes.ssi_health": {"aiPeer": "ssi_health"}, - "hermes.heimdall": {"aiPeer": "heimdall"}, + "hermes_heimdall": {"aiPeer": "heimdall"}, } })) @@ -791,7 +791,7 @@ class TestRenameProfile: cfg = json.loads(honcho_path.read_text()) assert cfg["hosts"]["hermes.ssi_health"]["aiPeer"] == "ssi_health" - assert cfg["hosts"]["hermes.heimdall"]["aiPeer"] == "heimdall" + assert cfg["hosts"]["hermes_heimdall"]["aiPeer"] == "heimdall" def test_default_raises_value_error(self, profile_env): with pytest.raises(ValueError, match="default"): diff --git a/tests/honcho_plugin/test_cli.py b/tests/honcho_plugin/test_cli.py index 8244badc2f6..74b7e1bc34e 100644 --- a/tests/honcho_plugin/test_cli.py +++ b/tests/honcho_plugin/test_cli.py @@ -1,6 +1,7 @@ """Tests for plugins/memory/honcho/cli.py.""" from types import SimpleNamespace +import json class TestResolveApiKey: @@ -100,6 +101,84 @@ class TestResolveApiKey: f"expected local sentinel for legacy schemeless {legacy!r}" +class TestCmdSetupLocalJwt: + """Local-deployment setup must allow configuring a JWT for AUTH_JWT_SECRET-backed Honcho servers.""" + + def _run_setup(self, monkeypatch, tmp_path, initial_cfg, prompt_answers): + import plugins.memory.honcho.cli as honcho_cli + + # Avoid touching real config / SDK / filesystem. + cfg_path = tmp_path / "honcho.json" + monkeypatch.setattr(honcho_cli, "_read_config", lambda: dict(initial_cfg)) + monkeypatch.setattr(honcho_cli, "_local_config_path", lambda: cfg_path) + monkeypatch.setattr(honcho_cli, "_config_path", lambda: cfg_path) + monkeypatch.setattr(honcho_cli, "_host_key", lambda: "hermes") + monkeypatch.setattr(honcho_cli, "_ensure_sdk_installed", lambda: True) + + written = {} + + def _capture_write(cfg, path=None): + written["cfg"] = cfg + written["path"] = path + + monkeypatch.setattr(honcho_cli, "_write_config", _capture_write) + + # Feed scripted prompt answers in order. + answers = list(prompt_answers) + + def _fake_prompt(label, default=None, secret=False): + if not answers: + # Default-through any remaining prompts to keep the wizard moving. + return default or "" + return answers.pop(0) + + monkeypatch.setattr(honcho_cli, "_prompt", _fake_prompt) + + honcho_cli.cmd_setup(SimpleNamespace()) + return written.get("cfg") + + def test_local_setup_stores_jwt_under_host_block(self, monkeypatch, tmp_path): + """Self-hosted users supplying a JWT must have it written under hosts..apiKey, + not as the top-level cloud apiKey, so cloud/hybrid switching is preserved and + get_honcho_client treats it as an explicit local auth opt-in.""" + cfg = self._run_setup( + monkeypatch, + tmp_path, + initial_cfg={}, + prompt_answers=[ + "local", # deployment + "http://localhost:8000", # base URL + "my-local-jwt-token", # local JWT + ], + ) + assert cfg is not None + assert cfg.get("baseUrl") == "http://localhost:8000" + # Top-level apiKey must remain unset (cloud field). + assert not cfg.get("apiKey") + # The new local JWT belongs under the host block. + host_block = (cfg.get("hosts") or {}).get("hermes") or {} + assert host_block.get("apiKey") == "my-local-jwt-token" + + def test_local_setup_blank_jwt_keeps_local_no_auth(self, monkeypatch, tmp_path): + """Blank JWT prompt response on a fresh local config must not introduce an apiKey + anywhere (local no-auth Honcho deployments must still work out of the box).""" + cfg = self._run_setup( + monkeypatch, + tmp_path, + initial_cfg={}, + prompt_answers=[ + "local", + "http://localhost:8000", + "", # blank JWT + ], + ) + assert cfg is not None + assert cfg.get("baseUrl") == "http://localhost:8000" + assert not cfg.get("apiKey") + host_block = (cfg.get("hosts") or {}).get("hermes") or {} + assert not host_block.get("apiKey") + + class TestCmdStatus: def test_reports_connection_failure_when_session_setup_fails(self, monkeypatch, capsys, tmp_path): import plugins.memory.honcho.cli as honcho_cli @@ -192,7 +271,7 @@ class TestCloneHonchoForProfile: honcho_cli, written = self._setup_clone_env(monkeypatch, tmp_path, cfg) ok = honcho_cli.clone_honcho_for_profile("coder") assert ok is True - new_block = written["cfg"]["hosts"]["hermes.coder"] + new_block = written["cfg"]["hosts"]["hermes_coder"] assert new_block["userPeerAliases"] == {"86701400": "eri", "discord-491827364": "eri"} def test_runtime_peer_prefix_carries_into_cloned_profile(self, monkeypatch, tmp_path): @@ -208,7 +287,7 @@ class TestCloneHonchoForProfile: honcho_cli, written = self._setup_clone_env(monkeypatch, tmp_path, cfg) ok = honcho_cli.clone_honcho_for_profile("coder") assert ok is True - new_block = written["cfg"]["hosts"]["hermes.coder"] + new_block = written["cfg"]["hosts"]["hermes_coder"] assert new_block["runtimePeerPrefix"] == "telegram_" def test_pin_peer_name_carries_into_cloned_profile(self, monkeypatch, tmp_path): @@ -224,7 +303,7 @@ class TestCloneHonchoForProfile: honcho_cli, written = self._setup_clone_env(monkeypatch, tmp_path, cfg) ok = honcho_cli.clone_honcho_for_profile("coder") assert ok is True - new_block = written["cfg"]["hosts"]["hermes.coder"] + new_block = written["cfg"]["hosts"]["hermes_coder"] assert new_block["pinPeerName"] is True def test_unset_identity_keys_do_not_appear_in_cloned_profile(self, monkeypatch, tmp_path): @@ -235,7 +314,7 @@ class TestCloneHonchoForProfile: honcho_cli, written = self._setup_clone_env(monkeypatch, tmp_path, cfg) ok = honcho_cli.clone_honcho_for_profile("coder") assert ok is True - new_block = written["cfg"]["hosts"]["hermes.coder"] + new_block = written["cfg"]["hosts"]["hermes_coder"] assert "userPeerAliases" not in new_block assert "runtimePeerPrefix" not in new_block assert "pinPeerName" not in new_block @@ -572,5 +651,5 @@ class TestCloneCarriesPinUserPeer: ok = honcho_cli.clone_honcho_for_profile("partner") assert ok is True - new_block = written["cfg"]["hosts"]["hermes.partner"] + new_block = written["cfg"]["hosts"]["hermes_partner"] assert new_block["pinUserPeer"] is True diff --git a/tests/honcho_plugin/test_client.py b/tests/honcho_plugin/test_client.py index a02e6937a34..929df4283f6 100644 --- a/tests/honcho_plugin/test_client.py +++ b/tests/honcho_plugin/test_client.py @@ -13,6 +13,7 @@ import pytest from plugins.memory.honcho.client import ( HonchoClientConfig, get_honcho_client, + profile_host_key, reset_honcho_client, resolve_active_host, resolve_config_path, @@ -430,6 +431,10 @@ class TestResolveConfigPath: class TestResolveActiveHost: + def test_profile_host_key_uses_honcho_safe_separator(self): + assert profile_host_key("coder") == "hermes_coder" + assert profile_host_key("default") == "hermes" + def test_default_returns_hermes(self): with patch.dict(os.environ, {}, clear=True): os.environ.pop("HERMES_HONCHO_HOST", None) @@ -444,7 +449,7 @@ class TestResolveActiveHost: with patch.dict(os.environ, {}, clear=False): os.environ.pop("HERMES_HONCHO_HOST", None) with patch("hermes_cli.profiles.get_active_profile_name", return_value="coder"): - assert resolve_active_host() == "hermes.coder" + assert resolve_active_host() == "hermes_coder" def test_default_profile_returns_hermes(self): with patch.dict(os.environ, {}, clear=False): @@ -477,10 +482,10 @@ class TestResolveActiveHost: class TestProfileScopedConfig: def test_from_env_uses_profile_host(self): with patch.dict(os.environ, {"HONCHO_API_KEY": "key"}): - config = HonchoClientConfig.from_env(host="hermes.coder") - assert config.host == "hermes.coder" + config = HonchoClientConfig.from_env(host="hermes_coder") + assert config.host == "hermes_coder" assert config.workspace_id == "hermes" # shared workspace - assert config.ai_peer == "hermes.coder" + assert config.ai_peer == "hermes_coder" def test_from_env_default_workspace_preserved_for_default_host(self): with patch.dict(os.environ, {"HONCHO_API_KEY": "key"}): @@ -494,22 +499,35 @@ class TestProfileScopedConfig: "apiKey": "shared-key", "hosts": { "hermes": {"aiPeer": "hermes", "peerName": "alice"}, - "hermes.coder": { - "aiPeer": "hermes.coder", + "hermes_coder": { + "aiPeer": "hermes_coder", "peerName": "alice-coder", "workspace": "coder-ws", }, }, })) config = HonchoClientConfig.from_global_config( - host="hermes.coder", config_path=config_file, + host="hermes_coder", config_path=config_file, ) - assert config.host == "hermes.coder" + assert config.host == "hermes_coder" assert config.workspace_id == "coder-ws" - assert config.ai_peer == "hermes.coder" + assert config.ai_peer == "hermes_coder" assert config.peer_name == "alice-coder" def test_from_global_config_auto_resolves_host(self, tmp_path): + config_file = tmp_path / "config.json" + config_file.write_text(json.dumps({ + "apiKey": "key", + "hosts": { + "hermes_dreamer": {"peerName": "dreamer-user"}, + }, + })) + with patch("plugins.memory.honcho.client.resolve_active_host", return_value="hermes_dreamer"): + config = HonchoClientConfig.from_global_config(config_path=config_file) + assert config.host == "hermes_dreamer" + assert config.peer_name == "dreamer-user" + + def test_from_global_config_reads_legacy_dot_profile_host_block(self, tmp_path): config_file = tmp_path / "config.json" config_file.write_text(json.dumps({ "apiKey": "key", @@ -517,10 +535,13 @@ class TestProfileScopedConfig: "hermes.dreamer": {"peerName": "dreamer-user"}, }, })) - with patch("plugins.memory.honcho.client.resolve_active_host", return_value="hermes.dreamer"): - config = HonchoClientConfig.from_global_config(config_path=config_file) - assert config.host == "hermes.dreamer" + config = HonchoClientConfig.from_global_config( + host="hermes_dreamer", + config_path=config_file, + ) + assert config.host == "hermes_dreamer" assert config.peer_name == "dreamer-user" + assert config.workspace_id == "hermes_dreamer" class TestObservationModeMigration: @@ -890,3 +911,176 @@ class TestDialecticDepthParsing: })) config = HonchoClientConfig.from_global_config(config_path=config_file) assert config.dialectic_depth_levels == ["low", "high"] + + +class TestGetHonchoClientBaseUrlDoublePrefixFix: + """Regression tests for #20688 — Honcho SDK double-prefixing of /v3 for + self-hosted instances where base_url already contains a version path.""" + + def teardown_method(self): + reset_honcho_client() + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + def test_local_base_url_with_v3_suffix_stripped(self): + """base_url 'http://localhost:38000/v3' must become 'http://localhost:38000' + before passing to the Honcho SDK to avoid double '/v3/v3' prefixing.""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key=None, + base_url="http://localhost:38000/v3", + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == "http://localhost:38000", ( + f"Expected 'http://localhost:38000', got {passed_base_url!r}" + ) + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + def test_local_base_url_without_version_unchanged(self): + """base_url 'http://localhost:38000' (no version) must be passed unchanged.""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key=None, + base_url="http://localhost:38000", + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == "http://localhost:38000", ( + f"Expected 'http://localhost:38000', got {passed_base_url!r}" + ) + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + def test_cloud_base_url_without_version_unchanged(self): + """A cloud base_url with no version segment must pass through untouched.""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key="cloud-key", + base_url="https://api.honcho.dev", + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == "https://api.honcho.dev", ( + f"Expected 'https://api.honcho.dev', got {passed_base_url!r}" + ) + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + def test_cloud_base_url_with_version_stripped(self): + """A version segment double-prefixes regardless of host, so a cloud + base_url that ends in '/v3' must also be stripped (the SDK re-adds it).""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key="cloud-key", + base_url="https://api.honcho.dev/v3", + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == "https://api.honcho.dev", ( + f"Expected 'https://api.honcho.dev', got {passed_base_url!r}" + ) + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + @pytest.mark.parametrize( + "raw_url, expected", + [ + # LAN IP self-host + ("http://10.0.0.5:8000/v3", "http://10.0.0.5:8000"), + ("http://192.168.1.20:38000/v3/", "http://192.168.1.20:38000"), + # Tailscale / custom-domain self-host + ("https://honcho.my.ts.net/v3", "https://honcho.my.ts.net"), + ("https://honcho.lab.internal/v3", "https://honcho.lab.internal"), + ("https://honcho.fly.dev/v3", "https://honcho.fly.dev"), + # higher version segments are also stripped + ("https://honcho.lab.internal/v12", "https://honcho.lab.internal"), + # self-host without a version segment is left unchanged + ("https://honcho.my.ts.net", "https://honcho.my.ts.net"), + ("http://10.0.0.5:8000", "http://10.0.0.5:8000"), + ], + ) + def test_self_hosted_base_url_version_stripped(self, raw_url, expected): + """Non-loopback self-hosted instances (LAN IPs, Tailscale, custom + domains) must get the same version-segment stripping as localhost. + Regression for #20688 recurring on any non-loopback self-host.""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key="self-host-key", + base_url=raw_url, + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == expected, ( + f"Expected {expected!r}, got {passed_base_url!r}" + ) + + @pytest.mark.skipif( + not importlib.util.find_spec("honcho"), + reason="honcho SDK not installed" + ) + def test_local_base_url_with_trailing_slash_stripped(self): + """base_url 'http://127.0.0.1:38000/v3/' must also be cleaned up.""" + fake_honcho = MagicMock(name="Honcho") + cfg = HonchoClientConfig( + api_key=None, + base_url="http://127.0.0.1:38000/v3/", + workspace_id="hermes", + environment="production", + ) + + with patch("honcho.Honcho", return_value=fake_honcho) as mock_honcho, \ + patch("hermes_cli.config.load_config", return_value={}): + get_honcho_client(cfg) + + mock_honcho.assert_called_once() + passed_base_url = mock_honcho.call_args.kwargs.get("base_url") + assert passed_base_url == "http://127.0.0.1:38000", ( + f"Expected 'http://127.0.0.1:38000', got {passed_base_url!r}" + ) diff --git a/tests/honcho_plugin/test_pin_peer_name.py b/tests/honcho_plugin/test_pin_peer_name.py index ef3a215f329..1e72bc97d1a 100644 --- a/tests/honcho_plugin/test_pin_peer_name.py +++ b/tests/honcho_plugin/test_pin_peer_name.py @@ -745,10 +745,10 @@ class TestPinTransition: monkeypatch.setenv("HERMES_HOME", str(tmp_path)) cfg_path.write_text(json.dumps({"apiKey": "k", "peerName": "Igor", "pinPeerName": True})) - sig_pinned = GatewayRunner._extract_cache_busting_config({}) + sig_pinned = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) cfg_path.write_text(json.dumps({"apiKey": "k", "peerName": "Igor", "pinPeerName": False})) - sig_unpinned = GatewayRunner._extract_cache_busting_config({}) + sig_unpinned = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) assert sig_pinned["honcho.pin_peer_name"] != sig_unpinned["honcho.pin_peer_name"] @@ -759,14 +759,14 @@ class TestPinTransition: monkeypatch.setenv("HERMES_HOME", str(tmp_path)) cfg_path.write_text(json.dumps({"apiKey": "k", "peerName": "Igor"})) - sig_no_aliases = GatewayRunner._extract_cache_busting_config({}) + sig_no_aliases = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) cfg_path.write_text(json.dumps({ "apiKey": "k", "peerName": "Igor", "userPeerAliases": {"86701400": "Igor"}, })) - sig_with_aliases = GatewayRunner._extract_cache_busting_config({}) + sig_with_aliases = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) assert sig_no_aliases["honcho.user_peer_aliases"] != sig_with_aliases["honcho.user_peer_aliases"] @@ -777,14 +777,14 @@ class TestPinTransition: monkeypatch.setenv("HERMES_HOME", str(tmp_path)) cfg_path.write_text(json.dumps({"apiKey": "k", "peerName": "Igor"})) - sig_no_prefix = GatewayRunner._extract_cache_busting_config({}) + sig_no_prefix = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) cfg_path.write_text(json.dumps({ "apiKey": "k", "peerName": "Igor", "runtimePeerPrefix": "telegram_", })) - sig_with_prefix = GatewayRunner._extract_cache_busting_config({}) + sig_with_prefix = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) assert sig_no_prefix["honcho.runtime_peer_prefix"] != sig_with_prefix["honcho.runtime_peer_prefix"] @@ -805,14 +805,14 @@ class TestPinTransition: "peerName": "Igor", "aiPeer": "hermes", })) - sig_before = GatewayRunner._extract_cache_busting_config({}) + sig_before = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) cfg_path.write_text(json.dumps({ "apiKey": "k", "peerName": "Igor", "aiPeer": "hermetika", })) - sig_after = GatewayRunner._extract_cache_busting_config({}) + sig_after = GatewayRunner._extract_cache_busting_config({"memory": {"provider": "honcho"}}) assert sig_before["honcho.ai_peer"] != sig_after["honcho.ai_peer"] diff --git a/tests/plugins/memory/test_hindsight_provider.py b/tests/plugins/memory/test_hindsight_provider.py index bc62b7f2c8f..f49c227611a 100644 --- a/tests/plugins/memory/test_hindsight_provider.py +++ b/tests/plugins/memory/test_hindsight_provider.py @@ -6,7 +6,9 @@ turn counting, tags), and schema completeness. """ import json +import os import re +import stat import sys from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock @@ -1570,3 +1572,13 @@ class TestShutdown: assert embedded._client is None assert provider._client is None + +@pytest.mark.skipif(os.name == "nt", reason="POSIX mode bits not enforced on Windows") +def test_save_config_sets_owner_only_permissions(tmp_path): + """hindsight/config.json must be written with 0o600 so API key is not world-readable.""" + provider = HindsightMemoryProvider() + provider.save_config({"api_key": "hd-test-key"}, str(tmp_path)) + config_file = tmp_path / "hindsight" / "config.json" + assert config_file.exists() + mode = stat.S_IMODE(config_file.stat().st_mode) + assert mode == 0o600, f"Expected 0o600 (owner-only), got {oct(mode)}" diff --git a/tests/plugins/memory/test_mem0_v2.py b/tests/plugins/memory/test_mem0_v2.py index 1ef85499b54..a9a86676452 100644 --- a/tests/plugins/memory/test_mem0_v2.py +++ b/tests/plugins/memory/test_mem0_v2.py @@ -4,6 +4,10 @@ Salvaged from PRs #5301 (qaqcvc) and #5117 (vvvanguards). """ import json +import os +import stat + +import pytest from plugins.memory.mem0 import Mem0MemoryProvider @@ -202,6 +206,17 @@ class TestMem0ResponseUnwrapping: # --------------------------------------------------------------------------- +@pytest.mark.skipif(os.name == "nt", reason="POSIX mode bits not enforced on Windows") +def test_save_config_sets_owner_only_permissions(tmp_path): + """mem0.json must be written with 0o600 so API key is not world-readable.""" + provider = Mem0MemoryProvider() + provider.save_config({"api_key": "m0-test-key"}, str(tmp_path)) + config_file = tmp_path / "mem0.json" + assert config_file.exists() + mode = stat.S_IMODE(config_file.stat().st_mode) + assert mode == 0o600, f"Expected 0o600 (owner-only), got {oct(mode)}" + + class TestMem0Defaults: """Ensure we don't break existing users' defaults.""" diff --git a/tests/plugins/memory/test_supermemory_provider.py b/tests/plugins/memory/test_supermemory_provider.py index 0aee459757f..d5f1c5bb174 100644 --- a/tests/plugins/memory/test_supermemory_provider.py +++ b/tests/plugins/memory/test_supermemory_provider.py @@ -1,4 +1,6 @@ import json +import os +import stat import threading import pytest @@ -409,3 +411,13 @@ def test_get_config_schema_minimal(): assert len(schema) == 1 assert schema[0]["key"] == "api_key" assert schema[0]["secret"] is True + + +@pytest.mark.skipif(os.name == "nt", reason="POSIX mode bits not enforced on Windows") +def test_save_config_sets_owner_only_permissions(tmp_path): + """supermemory.json must be written with 0o600 so API key is not world-readable.""" + _save_supermemory_config({"api_key": "sm-test-key"}, str(tmp_path)) + config_file = tmp_path / "supermemory.json" + assert config_file.exists() + mode = stat.S_IMODE(config_file.stat().st_mode) + assert mode == 0o600, f"Expected 0o600 (owner-only), got {oct(mode)}" diff --git a/tests/test_honcho_client_config.py b/tests/test_honcho_client_config.py index d4c62d610e9..f7b1efa151c 100644 --- a/tests/test_honcho_client_config.py +++ b/tests/test_honcho_client_config.py @@ -2,9 +2,13 @@ import json import os +import stat +from pathlib import Path +import pytest from plugins.memory.honcho.client import HonchoClientConfig +from plugins.memory.honcho import HonchoMemoryProvider class TestHonchoClientConfigAutoEnable: @@ -100,3 +104,24 @@ class TestHonchoClientConfigAutoEnable: assert cfg.api_key == "fallback-key" assert cfg.enabled is True # from_env() sets enabled=True + + +@pytest.mark.skipif(os.name == "nt", reason="POSIX mode bits not enforced on Windows") +def test_save_config_sets_owner_only_permissions(tmp_path, monkeypatch): + """honcho.json is created atomically with 0o600, not chmod-after-write.""" + import utils + calls = [] + real_atomic = utils.atomic_json_write + + def spy(path, data, **kwargs): + calls.append(kwargs.get("mode")) + return real_atomic(path, data, **kwargs) + + monkeypatch.setattr(utils, "atomic_json_write", spy) + provider = HonchoMemoryProvider() + provider.save_config({"api_key": "hc-test-key"}, str(tmp_path)) + assert calls == [0o600] + config_file = tmp_path / "honcho.json" + assert config_file.exists() + mode = stat.S_IMODE(config_file.stat().st_mode) + assert mode == 0o600, f"Expected 0o600 (owner-only), got {oct(mode)}" diff --git a/utils.py b/utils.py index 156fd38bdc3..cb08ba12869 100644 --- a/utils.py +++ b/utils.py @@ -87,6 +87,7 @@ def atomic_json_write( data: Any, *, indent: int = 2, + mode: int | None = None, **dump_kwargs: Any, ) -> None: """Write JSON data to a file atomically. @@ -99,13 +100,16 @@ def atomic_json_write( path: Target file path (will be created or overwritten). data: JSON-serializable data to write. indent: JSON indentation (default 2). + mode: Optional final permission mode. When set, the temp file is + created and replaced with this mode, avoiding chmod-after-write + TOCTOU exposure for secret-bearing files. **dump_kwargs: Additional keyword args forwarded to json.dump(), such as default=str for non-native types. """ path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) - original_mode = _preserve_file_mode(path) + original_mode = None if mode is not None else _preserve_file_mode(path) fd, tmp_path = tempfile.mkstemp( dir=str(path.parent), @@ -113,6 +117,8 @@ def atomic_json_write( suffix=".tmp", ) try: + if mode is not None: + os.fchmod(fd, mode) with os.fdopen(fd, "w", encoding="utf-8") as f: json.dump( data, @@ -125,7 +131,13 @@ def atomic_json_write( os.fsync(f.fileno()) # Preserve symlinks — swap in-place on the real file (GitHub #16743). real_path = atomic_replace(tmp_path, path) - _restore_file_mode(real_path, original_mode) + if mode is not None: + try: + os.chmod(real_path, mode) + except OSError: + pass + else: + _restore_file_mode(Path(real_path), original_mode) except BaseException: # Intentionally catch BaseException so temp-file cleanup still runs for # KeyboardInterrupt/SystemExit before re-raising the original signal. diff --git a/website/docs/user-guide/features/honcho.md b/website/docs/user-guide/features/honcho.md index 61dd73e8f2e..b971bea272d 100644 --- a/website/docs/user-guide/features/honcho.md +++ b/website/docs/user-guide/features/honcho.md @@ -106,6 +106,10 @@ The auto-injected dialectic scales `dialecticReasoningLevel` by query length: +1 Honcho is configured in `~/.honcho/config.json` (global) or `$HERMES_HOME/honcho.json` (profile-local). The setup wizard handles this for you. +### Self-Hosted Honcho with Authentication + +When pointing Hermes at a self-hosted Honcho server, `hermes honcho setup` (and `hermes memory setup`) ask for a **local JWT / bearer token** after the base URL. Paste a JWT signed with the server's `AUTH_JWT_SECRET` (the Honcho compose env var) to enable authenticated access; leave it blank for servers running with `AUTH_USE_AUTH=false`. The local token is stored under the host block (`hosts..apiKey` in `honcho.json`), separate from any cloud `apiKey`, so you can flip the `Cloud or local?` prompt back to `cloud` later without losing either credential. + ### Full Config Reference | Key | Default | Description | @@ -199,11 +203,12 @@ When Honcho is active as the memory provider, five tools become available: ## CLI Commands -The `hermes honcho` subcommand is **only registered when Honcho is the active memory provider** (`memory.provider: honcho` in `config.yaml`). Run `hermes memory setup` and pick Honcho first; the subcommand appears on the next invocation. +The `hermes honcho` subcommand is **only registered when Honcho is the active memory provider** (`memory.provider: honcho` in `config.yaml`). On a fresh install, configure Honcho directly with `hermes memory setup honcho` (or run `hermes memory setup` and pick it from the list); the `hermes honcho` subcommand then appears on the next invocation. ```bash +hermes memory setup honcho # Configure Honcho directly (works before activation) hermes honcho status # Connection status, config, and key settings -hermes honcho setup # Redirects to `hermes memory setup` +hermes honcho setup # Redirects to `hermes memory setup` (post-activation alias) hermes honcho strategy # Show or set session strategy (per-session/per-directory/per-repo/global) hermes honcho peer # Show or update peer names + dialectic reasoning level hermes honcho mode # Show or set recall mode (hybrid/context/tools) diff --git a/website/docs/user-guide/features/memory-providers.md b/website/docs/user-guide/features/memory-providers.md index f584c7288a8..00f2555d620 100644 --- a/website/docs/user-guide/features/memory-providers.md +++ b/website/docs/user-guide/features/memory-providers.md @@ -66,7 +66,7 @@ AI-native cross-session user modeling with dialectic reasoning, session-scoped c hermes memory setup # select "honcho" — runs the Honcho-specific post-setup ``` -The legacy `hermes honcho setup` command still works (it now redirects to `hermes memory setup`), but is only registered after Honcho is selected as the active memory provider. +On a fresh install, configure Honcho directly with `hermes memory setup honcho`. The legacy `hermes honcho setup` command still works (it now redirects to `hermes memory setup`), but is only registered after Honcho is selected as the active memory provider. **Config:** `$HERMES_HOME/honcho.json` (profile-local) or `~/.honcho/config.json` (global). Resolution order: `$HERMES_HOME/honcho.json` > `~/.hermes/honcho.json` > `~/.honcho/config.json`. See the [config reference](https://github.com/NousResearch/hermes-agent/blob/main/plugins/memory/honcho/README.md) and the [Honcho integration guide](https://docs.honcho.dev/v3/guides/integrations/hermes). From 0437137fff821854066088fbcd590d7af54c6857 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 23:23:54 -0700 Subject: [PATCH 18/89] security: pin patched Starlette (>=1.0.1) for CVE-2026-48710 BadHost (#35118) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starlette < 1.0.1 is affected by CVE-2026-48710 ("BadHost", CWE-444). The HTTP Host header was not validated before being used to rebuild `request.url`, so a malformed Host could make `request.url.path` desync from the raw ASGI path the router actually dispatched. Middleware and endpoints that apply path-based authorization off `request.url` (rather than `scope["path"]`) can therefore be bypassed. Hermes pulls Starlette transitively, never directly: - [web] -> fastapi==0.133.1 (starlette>=0.40.0, no upper bound) - [mcp] -> mcp==1.26.0 + sse-starlette (starlette>=0.27 / >=0.49.1) - [computer-use] -> mcp==1.26.0 - [dev] -> mcp==1.26.0 A fresh resolve landed starlette 0.52.1 — vulnerable. With no upper bound on the transitive specs, pip/uv could resolve any pre-1.0.1 release on a fresh install. Fix: pin starlette==1.0.1 directly in every extra that exposes a Starlette-backed server surface, regenerate uv.lock (only starlette moves: 0.52.1 -> 1.0.1, hash-verified), and mirror the pin in the lazy-install map (tools/lazy_deps.py `tool.dashboard`) so `hermes` on-demand dashboard installs can't re-resolve a vulnerable version. 1.0.1 is the advisory's named fix floor and the oldest patched release (more bake time than 1.1.0/1.2.0, which are days old); it satisfies every carrier constraint and our requires-python>=3.11. Scope note: this is a dependency-level fix complementing the application-layer Host-header validator added in #34162 (`hermes_cli/web_server.py` `_is_accepted_host`). Defense in depth at both the framework and app layers. Guards: two invariant tests in tests/test_packaging_metadata.py assert every server-surface extra pins starlette and that pyproject + uv.lock both resolve >= the 1.0.1 CVE floor — a dropped pin or stale lock fails in CI instead of shipping the bypass. Closes #35067 --- pyproject.toml | 17 +++++-- tests/test_packaging_metadata.py | 85 ++++++++++++++++++++++++++++++++ tools/lazy_deps.py | 1 + uv.lock | 17 +++++-- 4 files changed, 113 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3f102b1df7..6f565363e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ edge-tts = ["edge-tts==7.2.7"] modal = ["modal==1.3.4"] daytona = ["daytona==0.155.0"] hindsight = ["hindsight-client==0.6.1"] -dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "ty==0.0.21", "ruff==0.15.10", "setuptools==82.0.1"] +dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "starlette==1.0.1", "ty==0.0.21", "ruff==0.15.10", "setuptools==82.0.1"] # starlette: CVE-2026-48710 messaging = ["python-telegram-bot[webhooks]==22.6", "discord.py[voice]==2.7.1", "aiohttp==3.13.3", "brotlicffi==1.2.0.1", "slack-bolt==1.27.0", "slack-sdk==3.40.1", "qrcode==7.4.2"] cron = [] # croniter is now a core dependency; this extra kept for back-compat slack = ["slack-bolt==1.27.0", "slack-sdk==3.40.1", "aiohttp==3.13.3"] @@ -108,14 +108,21 @@ pty = [ "pywinpty==2.0.15; sys_platform == 'win32'", ] honcho = ["honcho-ai==2.0.1"] -mcp = ["mcp==1.26.0"] +# CVE-2026-48710 (BadHost): Starlette is pulled transitively by mcp's +# sse-starlette / HTTP-SSE stack (and by fastapi in the `web` extra). Before +# 1.0.1, a malformed Host header makes `request.url.path` desync from the path +# the ASGI router actually dispatched, so middleware/endpoints that gate on +# `request.url` can be bypassed. We pin a patched Starlette directly in every +# extra that exposes a Starlette-backed server surface so pip/uv can't resolve +# a vulnerable pre-1.0.1 transitive. Bump in lockstep with uv.lock. +mcp = ["mcp==1.26.0", "starlette==1.0.1"] # starlette: CVE-2026-48710 homeassistant = ["aiohttp==3.13.3"] sms = ["aiohttp==3.13.3"] # Computer use — macOS background desktop control via cua-driver (MCP stdio). # The cua-driver binary itself is installed via `hermes tools` post-setup # (curl install script); this extra just pins the MCP client used to talk # to it, which is already provided by the `mcp` extra. -computer-use = ["mcp==1.26.0"] +computer-use = ["mcp==1.26.0", "starlette==1.0.1"] # starlette: CVE-2026-48710 acp = ["agent-client-protocol==0.9.0"] # mistral: Voxtral STT + TTS. Pinned to an exact verified-clean version. # The `mistralai` PyPI project was quarantined 2026-05-12 after the malicious @@ -168,7 +175,9 @@ youtube = [ "youtube-transcript-api==1.2.4", ] # `hermes dashboard` (localhost SPA + API). Not in core to keep the default install lean. -web = ["fastapi==0.133.1", "uvicorn[standard]==0.41.0"] +# starlette==1.0.1 pinned for CVE-2026-48710 (BadHost) — fastapi pulls Starlette +# transitively and pre-1.0.1 is the vulnerable range. See the mcp extra above. +web = ["fastapi==0.133.1", "uvicorn[standard]==0.41.0", "starlette==1.0.1"] all = [ # Policy (2026-05-12): `[all]` includes only extras that genuinely # CAN'T be lazy-installed via `tools/lazy_deps.py` — i.e. things every diff --git a/tests/test_packaging_metadata.py b/tests/test_packaging_metadata.py index d72c0224a69..fadb022f31f 100644 --- a/tests/test_packaging_metadata.py +++ b/tests/test_packaging_metadata.py @@ -115,3 +115,88 @@ def test_bundled_plugin_manifests_ship_in_both_wheel_and_sdist(): assert "recursive-include plugins" in manifest and "plugin.yaml" in manifest, ( "MANIFEST.in must recursive-include plugins plugin.yaml/plugin.yml (sdist)" ) + + +# Minimum non-vulnerable Starlette: CVE-2026-48710 ("BadHost") was fixed in +# 1.0.1. Anything below that lets a malformed Host header desync +# ``request.url.path`` from the dispatched ASGI path, bypassing path-based +# authz in middleware/endpoints that gate on ``request.url``. Starlette is a +# transitive dep (fastapi in [web]; sse-starlette/mcp in [mcp]/[computer-use]/ +# [dev]) so we pin it directly in every extra that exposes a server surface and +# enforce the floor in both pyproject and the committed lockfile. +_STARLETTE_CVE_FLOOR = (1, 0, 1) + + +def _version_tuple(spec: str) -> tuple[int, ...]: + # "1.0.1" -> (1, 0, 1); tolerant of pre/post suffixes by truncating. + head = spec.split("+", 1)[0] + parts = [] + for chunk in head.split("."): + digits = "".join(ch for ch in chunk if ch.isdigit()) + if not digits: + break + parts.append(int(digits)) + return tuple(parts) + + +def test_starlette_pinned_above_cve_2026_48710_floor_in_pyproject(): + """Every extra that declares Starlette must pin a patched (>=1.0.1) version. + + Regression guard for #35067 / CVE-2026-48710. A future edit that drops the + pin (re-exposing the unbounded transitive ``starlette>=0.27`` from mcp / + ``>=0.40.0`` from fastapi) or pins a pre-1.0.1 version fails here instead of + shipping a Host-header auth-bypass to dashboard / MCP-HTTP users. + """ + data = tomllib.loads((REPO_ROOT / "pyproject.toml").read_text(encoding="utf-8")) + extras = data["project"]["optional-dependencies"] + + found = {} + for extra, specs in extras.items(): + for spec in specs: + name = spec.split("==", 1)[0].split(">", 1)[0].split("<", 1)[0].split("[", 1)[0].strip() + if name.lower() == "starlette": + assert "==" in spec, f"[{extra}] must exact-pin starlette, got {spec!r}" + ver = spec.split("==", 1)[1].split(";", 1)[0].strip() + found[extra] = ver + + # The four server-surface extras must each carry the direct pin. + for extra in ("web", "mcp", "computer-use", "dev"): + assert extra in found, ( + f"[{extra}] no longer pins starlette directly — CVE-2026-48710 " + f"regression risk (mcp/fastapi pull it transitively with no upper bound)" + ) + + for extra, ver in found.items(): + assert _version_tuple(ver) >= _STARLETTE_CVE_FLOOR, ( + f"[{extra}] pins starlette=={ver}, below the CVE-2026-48710 fix " + f"floor {'.'.join(map(str, _STARLETTE_CVE_FLOOR))}" + ) + + +def test_locked_starlette_is_not_vulnerable_to_cve_2026_48710(): + """The committed uv.lock must resolve starlette to a patched version. + + pyproject pins protect the declared extras, but the lockfile is what + hash-verified installs (``uv sync --locked``) actually pull. Assert the + resolved version is >= the CVE-2026-48710 fix floor so a stale-lock + regression can't ship a vulnerable Starlette to users. + """ + lock = (REPO_ROOT / "uv.lock").read_text(encoding="utf-8") + versions = [] + in_starlette = False + for line in lock.splitlines(): + if line.startswith("[[package]]"): + in_starlette = False + elif line.strip() == 'name = "starlette"': + in_starlette = True + elif in_starlette and line.startswith("version = "): + versions.append(line.split("=", 1)[1].strip().strip('"')) + in_starlette = False + + assert versions, "starlette not found in uv.lock" + for ver in versions: + assert _version_tuple(ver) >= _STARLETTE_CVE_FLOOR, ( + f"uv.lock resolves starlette=={ver}, below the CVE-2026-48710 fix " + f"floor {'.'.join(map(str, _STARLETTE_CVE_FLOOR))} — regenerate the " + f"lockfile after bumping the pin" + ) diff --git a/tools/lazy_deps.py b/tools/lazy_deps.py index a0926a435c7..20d68f2f7f3 100644 --- a/tools/lazy_deps.py +++ b/tools/lazy_deps.py @@ -173,6 +173,7 @@ LAZY_DEPS: dict[str, tuple[str, ...]] = { "tool.dashboard": ( "fastapi==0.133.1", "uvicorn[standard]==0.41.0", + "starlette==1.0.1", # CVE-2026-48710 (BadHost) — keep lazy-install in sync with pyproject [web] ), } diff --git a/uv.lock b/uv.lock index 24205de8627..299c659fd2f 100644 --- a/uv.lock +++ b/uv.lock @@ -1640,6 +1640,7 @@ all = [ { name = "ruff" }, { name = "setuptools" }, { name = "simple-term-menu" }, + { name = "starlette" }, { name = "ty" }, { name = "uvicorn", extra = ["standard"] }, { name = "youtube-transcript-api" }, @@ -1658,6 +1659,7 @@ cli = [ ] computer-use = [ { name = "mcp" }, + { name = "starlette" }, ] daytona = [ { name = "daytona" }, @@ -1670,6 +1672,7 @@ dev = [ { name = "pytest-timeout" }, { name = "ruff" }, { name = "setuptools" }, + { name = "starlette" }, { name = "ty" }, ] dingtalk = [ @@ -1716,6 +1719,7 @@ matrix = [ ] mcp = [ { name = "mcp" }, + { name = "starlette" }, ] messaging = [ { name = "aiohttp" }, @@ -1755,6 +1759,7 @@ termux = [ { name = "python-telegram-bot", extra = ["webhooks"] }, { name = "pywinpty", marker = "sys_platform == 'win32'" }, { name = "simple-term-menu" }, + { name = "starlette" }, ] termux-all = [ { name = "agent-client-protocol" }, @@ -1769,6 +1774,7 @@ termux-all = [ { name = "python-telegram-bot", extra = ["webhooks"] }, { name = "pywinpty", marker = "sys_platform == 'win32'" }, { name = "simple-term-menu" }, + { name = "starlette" }, { name = "uvicorn", extra = ["standard"] }, ] tts-premium = [ @@ -1781,6 +1787,7 @@ voice = [ ] web = [ { name = "fastapi" }, + { name = "starlette" }, { name = "uvicorn", extra = ["standard"] }, ] wecom = [ @@ -1886,6 +1893,10 @@ requires-dist = [ { name = "slack-sdk", marker = "extra == 'messaging'", specifier = "==3.40.1" }, { name = "slack-sdk", marker = "extra == 'slack'", specifier = "==3.40.1" }, { name = "sounddevice", marker = "extra == 'voice'", specifier = "==0.5.5" }, + { name = "starlette", marker = "extra == 'computer-use'", specifier = "==1.0.1" }, + { name = "starlette", marker = "extra == 'dev'", specifier = "==1.0.1" }, + { name = "starlette", marker = "extra == 'mcp'", specifier = "==1.0.1" }, + { name = "starlette", marker = "extra == 'web'", specifier = "==1.0.1" }, { name = "tenacity", specifier = "==9.1.4" }, { name = "ty", marker = "extra == 'dev'", specifier = "==0.0.21" }, { name = "tzdata", marker = "sys_platform == 'win32'", specifier = "==2025.3" }, @@ -4084,15 +4095,15 @@ wheels = [ [[package]] name = "starlette" -version = "0.52.1" +version = "1.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c4/68/79977123bb7be889ad680d79a40f339082c1978b5cfcf62c2d8d196873ac/starlette-0.52.1.tar.gz", hash = "sha256:834edd1b0a23167694292e94f597773bc3f89f362be6effee198165a35d62933", size = 2653702, upload-time = "2026-01-18T13:34:11.062Z" } +sdist = { url = "https://files.pythonhosted.org/packages/08/a3/84e821cc54b4ab50ae6dbc6ac3800a651b65ec35f045cc73785380654057/starlette-1.0.1.tar.gz", hash = "sha256:512399c5f1de7fac99c88572212ded9ddeddef2fb32afa82d724000e88b38f4f", size = 2659596, upload-time = "2026-05-21T21:58:58.433Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/b2df4bc09a1e51ff664c1e17018a4274b42e5e9352e4a478ea540512dc88/starlette-1.0.1-py3-none-any.whl", hash = "sha256:7c0e69b2ee1c848bd54669d908500117a3ee13de603a21427e5c6fc1adf98dcd", size = 72802, upload-time = "2026-05-21T21:58:56.551Z" }, ] [[package]] From 7b0915037c110ca10ff4da952bae2d0d786868ac Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Sat, 30 May 2026 11:39:25 +0530 Subject: [PATCH 19/89] test: remove low-value model-catalog mirror tests These tests asserted that hardcoded curated model lists/constants still contained specific model strings (e.g. 'glm-5' in provider_model_ids('zai'), exact context-length values per model key, PROVIDER_TO_MODELS_DEV entries). They mirror a constant rather than exercise logic, so they only ever break when models are added/retired and never catch a real bug. Removed 22 such functions across 7 files (149 deletions, 0 additions). Behavioral siblings are kept: live-catalog-wins, fallback ordering, substring/longest-match resolution, normalization, credential discovery, and probe-tier stepping all still tested. --- tests/agent/test_model_metadata.py | 62 ------------------- tests/agent/test_models_dev.py | 11 ---- .../hermes_cli/test_copilot_in_model_list.py | 19 ------ tests/hermes_cli/test_gmi_provider.py | 13 ---- tests/hermes_cli/test_model_validation.py | 32 ---------- tests/hermes_cli/test_models.py | 4 -- .../hermes_cli/test_ollama_cloud_provider.py | 9 --- 7 files changed, 150 deletions(-) diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index 3f9fd56d140..5b1abfd32d0 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -123,55 +123,6 @@ class TestEstimateMessagesTokensRough: # ========================================================================= class TestDefaultContextLengths: - def test_claude_models_context_lengths(self): - for key, value in DEFAULT_CONTEXT_LENGTHS.items(): - if "claude" not in key: - continue - # Claude 4.6+ models (4.6, 4.7, 4.8) have 1M context at standard - # API pricing (no long-context premium). Older Claude 4.x and - # 3.x models cap at 200k. - if any(tag in key for tag in ("4.6", "4-6", "4.7", "4-7", "4.8", "4-8")): - assert value == 1000000, f"{key} should be 1000000" - else: - assert value == 200000, f"{key} should be 200000" - - def test_gpt4_models_128k_or_1m(self): - # gpt-4.1 and gpt-4.1-mini have 1M context; other gpt-4* have 128k - for key, value in DEFAULT_CONTEXT_LENGTHS.items(): - if "gpt-4" in key and "gpt-4.1" not in key: - assert value == 128000, f"{key} should be 128000" - - def test_gpt41_models_1m(self): - for key, value in DEFAULT_CONTEXT_LENGTHS.items(): - if "gpt-4.1" in key: - assert value == 1047576, f"{key} should be 1047576" - - def test_gemini_models_1m(self): - for key, value in DEFAULT_CONTEXT_LENGTHS.items(): - if "gemini" in key: - assert value == 1048576, f"{key} should be 1048576" - - def test_grok_models_context_lengths(self): - # xAI /v1/models does not return context_length metadata, so - # DEFAULT_CONTEXT_LENGTHS must cover the Grok family explicitly. - # Values sourced from models.dev (2026-04). - expected = { - "grok-4.20": 2000000, - "grok-4-fast": 2000000, - "grok-4": 256000, - "grok-build": 256000, - "grok-code-fast": 256000, - "grok-3": 131072, - "grok-2": 131072, - "grok-2-vision": 8192, - "grok": 131072, - } - for key, value in expected.items(): - assert key in DEFAULT_CONTEXT_LENGTHS, f"{key} missing from DEFAULT_CONTEXT_LENGTHS" - assert DEFAULT_CONTEXT_LENGTHS[key] == value, ( - f"{key} should be {value}, got {DEFAULT_CONTEXT_LENGTHS[key]}" - ) - def test_grok_substring_matching(self): # Longest-first substring matching must resolve the real xAI model # IDs to the correct fallback entries without 128k probe-down. @@ -268,13 +219,6 @@ class TestDefaultContextLengths: f"{model_id}: expected {expected_ctx}, got {actual}" ) - def test_all_values_positive(self): - for key, value in DEFAULT_CONTEXT_LENGTHS.items(): - assert value > 0, f"{key} has non-positive context length" - - def test_dict_is_not_empty(self): - assert len(DEFAULT_CONTEXT_LENGTHS) >= 10 - # ========================================================================= # Codex OAuth context-window resolution (provider="openai-codex") @@ -1141,12 +1085,6 @@ class TestContextProbeTiers: for i in range(len(CONTEXT_PROBE_TIERS) - 1): assert CONTEXT_PROBE_TIERS[i] > CONTEXT_PROBE_TIERS[i + 1] - def test_first_tier_is_256k(self): - assert CONTEXT_PROBE_TIERS[0] == 256_000 - - def test_last_tier_is_8k(self): - assert CONTEXT_PROBE_TIERS[-1] == 8_000 - class TestGetNextProbeTier: def test_from_256k(self): diff --git a/tests/agent/test_models_dev.py b/tests/agent/test_models_dev.py index 41fb4463ec8..b4bbbf753df 100644 --- a/tests/agent/test_models_dev.py +++ b/tests/agent/test_models_dev.py @@ -82,17 +82,6 @@ SAMPLE_REGISTRY = { class TestProviderMapping: - def test_all_mapped_providers_are_strings(self): - for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items(): - assert isinstance(hermes_id, str) - assert isinstance(mdev_id, str) - - def test_known_providers_mapped(self): - assert PROVIDER_TO_MODELS_DEV["anthropic"] == "anthropic" - assert PROVIDER_TO_MODELS_DEV["copilot"] == "github-copilot" - assert PROVIDER_TO_MODELS_DEV["stepfun"] == "stepfun" - assert PROVIDER_TO_MODELS_DEV["kilocode"] == "kilo" - def test_xai_oauth_uses_xai_catalog(self): assert PROVIDER_TO_MODELS_DEV["xai"] == "xai" assert PROVIDER_TO_MODELS_DEV["xai-oauth"] == "xai" diff --git a/tests/hermes_cli/test_copilot_in_model_list.py b/tests/hermes_cli/test_copilot_in_model_list.py index e414687bce7..83832b0c332 100644 --- a/tests/hermes_cli/test_copilot_in_model_list.py +++ b/tests/hermes_cli/test_copilot_in_model_list.py @@ -6,25 +6,6 @@ from unittest.mock import patch from hermes_cli.model_switch import list_authenticated_providers -@patch.dict(os.environ, {"GH_TOKEN": "test-key"}, clear=False) -def test_copilot_picker_keeps_curated_copilot_models_when_live_catalog_unavailable(): - with patch("agent.models_dev.fetch_models_dev", return_value={}), \ - patch("hermes_cli.models._resolve_copilot_catalog_api_key", return_value="gh-token"), \ - patch("hermes_cli.models._fetch_github_models", return_value=None): - providers = list_authenticated_providers(current_provider="openrouter", max_models=50) - - copilot = next((p for p in providers if p["slug"] == "copilot"), None) - - assert copilot is not None - assert "gpt-5.4" in copilot["models"] - assert "claude-sonnet-4.6" in copilot["models"] - assert "claude-sonnet-4" in copilot["models"] - assert "claude-sonnet-4.5" in copilot["models"] - assert "claude-haiku-4.5" in copilot["models"] - assert "gemini-3.1-pro-preview" in copilot["models"] - assert "claude-opus-4.6" not in copilot["models"] - - @patch.dict(os.environ, {"GH_TOKEN": "test-key"}, clear=False) def test_copilot_picker_uses_live_catalog_when_available(): live_models = ["gpt-5.4", "claude-sonnet-4.6", "gemini-3.1-pro-preview"] diff --git a/tests/hermes_cli/test_gmi_provider.py b/tests/hermes_cli/test_gmi_provider.py index 2c2f146ed85..86aaf699bf6 100644 --- a/tests/hermes_cli/test_gmi_provider.py +++ b/tests/hermes_cli/test_gmi_provider.py @@ -80,14 +80,6 @@ class TestGmiConfigRegistry: class TestGmiModelCatalog: - def test_static_model_fallback_exists(self): - assert "gmi" in _PROVIDER_MODELS - models = _PROVIDER_MODELS["gmi"] - assert "zai-org/GLM-5.1-FP8" in models - assert "deepseek-ai/DeepSeek-V3.2" in models - assert "moonshotai/Kimi-K2.5" in models - assert "anthropic/claude-sonnet-4.6" in models - def test_canonical_provider_entry(self): slugs = [p.slug for p in CANONICAL_PROVIDERS] assert "gmi" in slugs @@ -267,11 +259,6 @@ class TestGmiModelMetadata: class TestGmiAuxiliary: - def test_aux_default_model(self): - from agent.auxiliary_client import _get_aux_model_for_provider - - assert _get_aux_model_for_provider("gmi") == "google/gemini-3.1-flash-lite-preview" - def test_resolve_provider_client_uses_gmi_aux_default(self, monkeypatch): monkeypatch.setenv("GMI_API_KEY", "gmi-test-key") diff --git a/tests/hermes_cli/test_model_validation.py b/tests/hermes_cli/test_model_validation.py index 91fc4e50d00..89465b6c6c7 100644 --- a/tests/hermes_cli/test_model_validation.py +++ b/tests/hermes_cli/test_model_validation.py @@ -142,10 +142,6 @@ class TestCuratedModelsForProvider: assert len(models) > 0 assert any("claude" in m[0] for m in models) - def test_zai_returns_glm_models(self): - models = curated_models_for_provider("zai") - assert any("glm" in m[0] for m in models) - def test_unknown_provider_returns_empty(self): assert curated_models_for_provider("totally-unknown") == [] @@ -199,9 +195,6 @@ class TestProviderModelIds: def test_unknown_provider_returns_empty(self): assert provider_model_ids("some-unknown-provider") == [] - def test_zai_returns_glm_models(self): - assert "glm-5" in provider_model_ids("zai") - def test_stepfun_prefers_live_catalog(self): with patch( "hermes_cli.auth.resolve_api_key_provider_credentials", @@ -222,31 +215,6 @@ class TestProviderModelIds: patch("hermes_cli.models._fetch_github_models", return_value=["gpt-5.4", "claude-sonnet-4.6"]): assert provider_model_ids("copilot-acp") == ["gpt-5.4", "claude-sonnet-4.6"] - def test_copilot_falls_back_to_curated_defaults_without_stale_opus(self): - with patch("hermes_cli.models._resolve_copilot_catalog_api_key", return_value="gh-token"), \ - patch("hermes_cli.models._fetch_github_models", return_value=None): - ids = provider_model_ids("copilot") - - assert "gpt-5.4" in ids - assert "claude-sonnet-4.6" in ids - assert "claude-sonnet-4" in ids - assert "claude-sonnet-4.5" in ids - assert "claude-haiku-4.5" in ids - assert "gemini-3.1-pro-preview" in ids - assert "claude-opus-4.6" not in ids - - def test_copilot_acp_falls_back_to_copilot_defaults(self): - with patch("hermes_cli.models._resolve_copilot_catalog_api_key", return_value="gh-token"), \ - patch("hermes_cli.models._fetch_github_models", return_value=None): - ids = provider_model_ids("copilot-acp") - - assert "gpt-5.4" in ids - assert "claude-sonnet-4.6" in ids - assert "claude-sonnet-4" in ids - assert "gemini-3.1-pro-preview" in ids - assert "copilot-acp" not in ids - assert "claude-opus-4.6" not in ids - # -- fetch_api_models -------------------------------------------------------- diff --git a/tests/hermes_cli/test_models.py b/tests/hermes_cli/test_models.py index db96a6558d7..f965f361dec 100644 --- a/tests/hermes_cli/test_models.py +++ b/tests/hermes_cli/test_models.py @@ -56,10 +56,6 @@ class TestOpenRouterModels: assert isinstance(mid, str) and len(mid) > 0 assert isinstance(desc, str) - def test_at_least_5_models(self): - """Sanity check that the models list hasn't been accidentally truncated.""" - assert len(OPENROUTER_MODELS) >= 5 - class TestFetchOpenRouterModels: def test_live_fetch_recomputes_free_tags(self, monkeypatch): diff --git a/tests/hermes_cli/test_ollama_cloud_provider.py b/tests/hermes_cli/test_ollama_cloud_provider.py index e62aa899ff8..ad7e3a0b9d9 100644 --- a/tests/hermes_cli/test_ollama_cloud_provider.py +++ b/tests/hermes_cli/test_ollama_cloud_provider.py @@ -495,12 +495,3 @@ class TestOllamaCloudSuffixStripping: assert _strip_ollama_cloud_suffix("qwen3-coder:480b-cloud") == "qwen3-coder:480b" assert _strip_ollama_cloud_suffix("nemotron-3-nano:30b") == "nemotron-3-nano:30b" assert _strip_ollama_cloud_suffix("") == "" - - -# ── Auxiliary Model ── - -class TestOllamaCloudAuxiliary: - def test_aux_model_defined(self): - from agent.auxiliary_client import _API_KEY_PROVIDER_AUX_MODELS - assert "ollama-cloud" in _API_KEY_PROVIDER_AUX_MODELS - assert _API_KEY_PROVIDER_AUX_MODELS["ollama-cloud"] == "nemotron-3-nano:30b" From 5a72e82fd8175597a82d4599ae35d20b1fb8fc89 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Fri, 29 May 2026 21:28:12 +0530 Subject: [PATCH 20/89] feat(tui): nudge toward /agents dashboard when delegation starts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TUI already ships a rich /agents spawn-tree dashboard (live tree, timeline, per-child tokens/cost/files/tools, kill/pause), but nothing surfaced it — during delegation the transcript stayed quiet and users had to already know to type /agents. Drop a one-time transient activity hint ("subagents working · /agents to watch live") the first time a turn starts delegating, matching the existing "· /logs to inspect" house style. Guards keep it unobtrusive: - fires at most once per turn (resets on message.start) - silent when the /agents overlay is already open - gated by display.tui_agents_nudge (default true) Hooked on subagent.start, not subagent.spawn_requested: the delegate progress callback in tools/delegate_tool.py only relays start/complete to the gateway and drops spawn_requested, so start is the first delegation event the TUI reliably receives. spawn_requested is wired too for the future case, guarded once-per-turn. Adds the display.tui_agents_nudge config default and gatewayTypes entry. --- hermes_cli/config.py | 5 + .../createGatewayEventHandler.test.ts | 113 +++++++++++++++++- ui-tui/src/app/createGatewayEventHandler.ts | 88 +++++++++++++- ui-tui/src/gatewayTypes.ts | 6 + 4 files changed, 208 insertions(+), 4 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index e2c59a694fe..ff473c23549 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1183,6 +1183,11 @@ DEFAULT_CONFIG = { # Mirrors `hermes -c` muscle memory. Default off so existing # users aren't surprised. HERMES_TUI_RESUME= always wins. "tui_auto_resume_recent": False, + # When true (default), `hermes --tui` drops a one-time hint + # ("subagents working · /agents to watch live") the first time a turn + # starts delegating, nudging the user toward the live spawn-tree + # dashboard. Set false to suppress the hint. + "tui_agents_nudge": True, "bell_on_complete": False, "show_reasoning": False, "streaming": False, diff --git a/ui-tui/src/__tests__/createGatewayEventHandler.test.ts b/ui-tui/src/__tests__/createGatewayEventHandler.test.ts index 0a3e4227396..897875b2c03 100644 --- a/ui-tui/src/__tests__/createGatewayEventHandler.test.ts +++ b/ui-tui/src/__tests__/createGatewayEventHandler.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest' import { createGatewayEventHandler } from '../app/createGatewayEventHandler.js' -import { getOverlayState, resetOverlayState } from '../app/overlayStore.js' +import { getOverlayState, patchOverlayState, resetOverlayState } from '../app/overlayStore.js' import { turnController } from '../app/turnController.js' import { getTurnState, resetTurnState } from '../app/turnStore.js' import { getUiState, patchUiState, resetUiState } from '../app/uiStore.js' @@ -897,6 +897,117 @@ describe('createGatewayEventHandler', () => { expect(getTurnState().subagents.find(s => s.id === 'sa-weird')?.status).toBe('completed') }) + it('nudges toward /agents on the first spawn_requested of a turn', () => { + const appended: Msg[] = [] + const onEvent = createGatewayEventHandler(buildCtx(appended)) + + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.spawn_requested' + } as any) + + const hints = getTurnState().activity.filter(a => a.text.includes('/agents')) + expect(hints).toHaveLength(1) + expect(hints[0]).toMatchObject({ tone: 'info' }) + }) + + it('nudges toward /agents on subagent.start (spawn_requested dropped in CLI path)', () => { + const appended: Msg[] = [] + const onEvent = createGatewayEventHandler(buildCtx(appended)) + + // In the real CLI→gateway path the delegate callback drops + // spawn_requested, so `start` is the first event the TUI sees. + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.start' + } as any) + + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(1) + }) + + it('nudges at most once per turn and resets on the next message.start', () => { + const appended: Msg[] = [] + const onEvent = createGatewayEventHandler(buildCtx(appended)) + + // Multiple spawns in one turn → a single hint. + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.start' + } as any) + onEvent({ + payload: { goal: 'child b', subagent_id: 'sa-b', task_index: 1 }, + type: 'subagent.start' + } as any) + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(1) + + // New turn clears activity AND the once-per-turn guard → nudges again. + onEvent({ payload: {}, type: 'message.start' } as any) + onEvent({ + payload: { goal: 'child c', subagent_id: 'sa-c', task_index: 0 }, + type: 'subagent.start' + } as any) + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(1) + }) + + it('does not nudge when the /agents overlay is already open', () => { + const appended: Msg[] = [] + const onEvent = createGatewayEventHandler(buildCtx(appended)) + + // User already has the dashboard open → nothing to advertise. + patchOverlayState({ agents: true }) + + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.start' + } as any) + + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(0) + }) + + it('nudges if the /agents overlay is closed mid-turn while delegation continues', () => { + const appended: Msg[] = [] + const onEvent = createGatewayEventHandler(buildCtx(appended)) + + // Overlay open on the first delegation event → suppressed, but the + // turn's nudge credit must NOT be burned (the user is watching). + patchOverlayState({ agents: true }) + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.start' + } as any) + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(0) + + // User closes the dashboard mid-turn → the next delegation event nudges. + patchOverlayState({ agents: false }) + onEvent({ + payload: { goal: 'child b', subagent_id: 'sa-b', task_index: 1 }, + type: 'subagent.start' + } as any) + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(1) + }) + + it('does not nudge when display.tui_agents_nudge is false', async () => { + const appended: Msg[] = [] + const ctx = buildCtx(appended) + // config.get → full returns the disable flag. + ctx.gateway.rpc = vi.fn(async (method: string) => + method === 'config.get' ? { config: { display: { tui_agents_nudge: false } } } : null + ) + const onEvent = createGatewayEventHandler(ctx) + + // Eager config fetch fires at creation; let it resolve before any spawn + // (mirrors real usage — config lands well before the first delegation). + await Promise.resolve() + await Promise.resolve() + + onEvent({ + payload: { goal: 'child a', subagent_id: 'sa-a', task_index: 0 }, + type: 'subagent.start' + } as any) + + expect(getTurnState().activity.filter(a => a.text.includes('/agents'))).toHaveLength(0) + }) + it('drops stale reasoning/tool/todos events after ctrl-c until the next message starts', () => { // Repro for the discord report: ctrl-c interrupts, but late reasoning/tool // events from the still-winding-down agent loop kept populating the UI for diff --git a/ui-tui/src/app/createGatewayEventHandler.ts b/ui-tui/src/app/createGatewayEventHandler.ts index 26d6cfacd0c..70264b0c7f9 100644 --- a/ui-tui/src/app/createGatewayEventHandler.ts +++ b/ui-tui/src/app/createGatewayEventHandler.ts @@ -17,7 +17,7 @@ import type { Msg, SubagentProgress, SubagentStatus } from '../types.js' import { applyDelegationStatus, getDelegationState } from './delegationStore.js' import type { GatewayEventHandlerContext } from './interfaces.js' -import { patchOverlayState } from './overlayStore.js' +import { getOverlayState, patchOverlayState } from './overlayStore.js' import { turnController } from './turnController.js' import { getUiState, patchUiState } from './uiStore.js' @@ -123,6 +123,78 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: // render a /warning close to the configured cap without spamming the RPC. let lastDelegationFetchAt = 0 + // ── Shared full-config read ────────────────────────────────────────── + // + // Several concerns need `display.*` flags at startup (the /agents nudge + // gate below, the auto-resume check in the `gateway.ready` handler). + // Memoize the `config.get full` RPC so we make exactly one round-trip + // instead of one per concern. Resolves to null on RPC failure; callers + // treat null as "use defaults". + let fullConfigPromise: null | Promise = null + + const getFullConfigOnce = (): Promise => { + fullConfigPromise ??= rpc('config.get', { key: 'full' }).catch(() => null) + + return fullConfigPromise + } + + // ── Nudge toward /agents on delegation ─────────────────────────────── + // + // When `display.tui_agents_nudge` is enabled (default true), the first + // time a turn starts delegating we drop a single transient activity hint + // ("subagents working · /agents to watch live") so the user discovers the + // spawn-tree dashboard instead of staring at a quiet transcript — without + // hijacking the screen by force-opening an overlay. Guards: + // • fires at most once per turn (`agentsNudgedThisTurn`) + // • silent if the overlay is already open (nothing to advertise) + // Reset on `message.start`. The config flag is fetched once, lazily; + // until it resolves we assume the default (on). + let agentsNudgeEnabled = true + let agentsNudgeConfigFetched = false + let agentsNudgedThisTurn = false + + const ensureAgentsNudgeConfig = () => { + if (agentsNudgeConfigFetched) { + return + } + + agentsNudgeConfigFetched = true + getFullConfigOnce().then(cfg => { + // Only an explicit `false` disables it; absent/unknown keeps default on. + if (cfg?.config?.display?.tui_agents_nudge === false) { + agentsNudgeEnabled = false + } + }) + } + + const maybeNudgeAgents = () => { + ensureAgentsNudgeConfig() + + if (!agentsNudgeEnabled || agentsNudgedThisTurn) { + return + } + + // Already watching → no point advertising the dashboard. Don't burn the + // turn's nudge credit here: if the user closes the overlay later in the + // same turn while delegation is still ongoing, a subsequent event should + // still be allowed to nudge. The flag is only set once we actually push. + if (getOverlayState().agents) { + return + } + + agentsNudgedThisTurn = true + turnController.pushActivity('subagents working · /agents to watch live', 'info') + } + + const resetAgentsNudgeTurnState = () => { + agentsNudgedThisTurn = false + } + + // Kick off the config fetch eagerly at handler creation so the flag is + // resolved well before the first delegation of any real session (which + // only happens after gateway.ready + a user turn). + ensureAgentsNudgeConfig() + const refreshDelegationStatus = (force = false) => { const now = Date.now() @@ -244,8 +316,8 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: // forging a brand-new one. Mirrors classic CLI's `hermes -c` / // `hermes --tui` muscle memory and addresses the audit's "session // unrecoverable after disconnection" gap. Default off so existing - // users aren't surprised. - rpc('config.get', { key: 'full' }) + // users aren't surprised. (Shares the memoized full-config read.) + getFullConfigOnce() .then(cfg => { if (!cfg?.config?.display?.tui_auto_resume_recent) { patchUiState({ status: 'forging session…' }) @@ -332,6 +404,7 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: } case 'message.start': + resetAgentsNudgeTurnState() turnController.startMessage() return @@ -618,6 +691,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: // Preserve completed state if a later event races in before this one. turnController.upsertSubagent(ev.payload, c => (isTerminalStatus(c.status) ? {} : { status: 'queued' })) + // First sign of delegation this turn → nudge toward /agents. + maybeNudgeAgents() + // Prime the status-bar HUD: fetch caps (once every 5s) so we can // warn as depth/concurrency approaches the configured ceiling. if (getDelegationState().maxSpawnDepth === null) { @@ -631,6 +707,12 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: case 'subagent.start': turnController.upsertSubagent(ev.payload, c => (isTerminalStatus(c.status) ? {} : { status: 'running' })) + // `subagent.start` is the first delegation event the TUI reliably + // receives (the delegate callback drops `spawn_requested` in the + // CLI→gateway path), so nudge here too. Once-per-turn guarded, so + // hooking both events is safe. + maybeNudgeAgents() + return case 'subagent.thinking': { const text = String(ev.payload.text ?? '').trim() diff --git a/ui-tui/src/gatewayTypes.ts b/ui-tui/src/gatewayTypes.ts index ae1f38e9b38..447dec3ea49 100644 --- a/ui-tui/src/gatewayTypes.ts +++ b/ui-tui/src/gatewayTypes.ts @@ -62,6 +62,12 @@ export interface ConfigDisplayConfig { show_reasoning?: boolean streaming?: boolean thinking_mode?: string + /** + * Nudge the user toward the /agents spawn-tree dashboard the first time a + * turn starts delegating, via a one-time transient activity hint. Opens + * nothing — just advertises the command. Default true. + */ + tui_agents_nudge?: boolean tui_auto_resume_recent?: boolean tui_compact?: boolean /** Legacy alias for display.mouse_tracking. */ From c1b2d0917fff3ff68064757c229cef8d717aa4e0 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 00:22:46 -0700 Subject: [PATCH 21/89] fix(cli): don't treat any container as the Docker image for updates (#35139) detect_install_method() returned "docker" for any container (is_container()), before the .git check. Both supported installs already self-identify via the .install_method stamp read first: the curl installer (scripts/install.sh) git-clones and stamps "git"; the published nousresearch/hermes-agent image stamps "docker" at boot via docker/stage2-hook.sh. An unsupported manual install dropped into a container has no stamp, so the bare container check hijacked it to "docker" and 'hermes update' bailed with the docker-pull guidance. Drop the redundant is_container() -> docker fallback. Unstamped installs now fall through to the .git/pip checks like any off-path install; both supported paths are unaffected because the stamp wins first. Fixes #34397. --- hermes_cli/config.py | 22 ++++++++++++----- .../hermes_cli/test_pip_install_detection.py | 24 +++++++++++++++++-- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 0e7a9e6ade2..a24af13aafc 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -285,9 +285,22 @@ def detect_install_method(project_root: Optional[Path] = None) -> str: Resolution order: 1. Stamped ``~/.hermes/.install_method`` file (written by installers) 2. HERMES_MANAGED env / .managed marker (NixOS, Homebrew) - 3. Container detection (/.dockerenv, /run/.containerenv, cgroup) - 4. .git directory presence -> 'git' - 5. Fallback -> 'pip' + 3. .git directory presence -> 'git' + 4. Fallback -> 'pip' + + Note: running inside a container is NOT treated as "docker" on its own. + The two supported install paths both self-identify via the + ``.install_method`` stamp (caught by step 1), so neither relies on + container detection here: + - the curl installer (scripts/install.sh, the README/website install + command) git-clones the repo and stamps ``git``; + - the published ``nousresearch/hermes-agent`` image stamps ``docker`` + at boot via ``docker/stage2-hook.sh``. + An unsupported manual install dropped into a container (no stamp) was + wrongly classified as the published image by bare container detection, + so ``hermes update`` bailed with "doesn't apply inside the Docker + container". Without that fallback such installs fall through to the + ``.git``/pip checks and behave like any off-path install. See issue #34397. """ stamp = get_hermes_home() / ".install_method" try: @@ -299,9 +312,6 @@ def detect_install_method(project_root: Optional[Path] = None) -> str: managed = get_managed_system() if managed: return managed.lower().replace(" ", "-") - from hermes_constants import is_container - if is_container(): - return "docker" if project_root is None: project_root = Path(__file__).parent.parent.resolve() if (project_root / ".git").is_dir(): diff --git a/tests/hermes_cli/test_pip_install_detection.py b/tests/hermes_cli/test_pip_install_detection.py index 49df74f626e..eb06e35f2bf 100644 --- a/tests/hermes_cli/test_pip_install_detection.py +++ b/tests/hermes_cli/test_pip_install_detection.py @@ -48,12 +48,32 @@ def test_stamp_file_takes_precedence(tmp_path): assert detect_install_method(project_root=tmp_path) == "docker" -def test_docker_detected_via_dockerenv(tmp_path): +def test_container_without_stamp_is_not_docker(tmp_path): + """An unstamped install in a generic container must NOT be flagged as docker. + + Regression for issue #34397. The two supported installs both stamp + ``.install_method`` (the curl installer -> ``git``, covered by + ``test_stamp_file_takes_precedence``; the published image -> ``docker``), + so neither hits this path. An unsupported manual install dropped into a + container has no stamp and was wrongly classified as the published Docker + image, so ``hermes update`` refused to run. With a ``.git`` checkout it + must resolve to ``git``. + """ + (tmp_path / ".git").mkdir() with patch("hermes_cli.config.get_managed_system", return_value=None), \ patch("hermes_cli.config.get_hermes_home", return_value=tmp_path), \ patch("hermes_constants.is_container", return_value=True): from hermes_cli.config import detect_install_method - assert detect_install_method(project_root=tmp_path) == "docker" + assert detect_install_method(project_root=tmp_path) == "git" + + +def test_container_pip_install_without_stamp_is_pip(tmp_path): + """Container + no .git + no stamp -> pip, not docker (issue #34397).""" + with patch("hermes_cli.config.get_managed_system", return_value=None), \ + patch("hermes_cli.config.get_hermes_home", return_value=tmp_path), \ + patch("hermes_constants.is_container", return_value=True): + from hermes_cli.config import detect_install_method + assert detect_install_method(project_root=tmp_path) == "pip" def test_recommended_update_command_docker(): From 636ff636d7d819503035b87655d2c7247e84def7 Mon Sep 17 00:00:00 2001 From: Max Hsu Date: Fri, 29 May 2026 14:54:11 +0800 Subject: [PATCH 22/89] fix(agent): strip schema-foreign keys from max-iterations summary request (#34436) The max-iterations summary path (`handle_max_iterations`) hand-builds its message list and calls `chat.completions.create()` directly, bypassing `ChatCompletionsTransport.convert_messages()`. It only popped ("reasoning", "finish_reason", "_thinking_prefill"), so `tool_name` (SQLite FTS bookkeeping), the `codex_*` reasoning carriers, and other internal `_`-prefixed scaffolding leaked to the wire. Strict OpenAI-compatible gateways (Fireworks-backed OpenCode Go, Mistral, Moonshot/Kimi) reject these with HTTP 400 "Extra inputs are not permitted, field: 'messages[N].tool_name'", so a long tool-using session that exhausts the iteration budget fails to summarise instead of returning the result. Mirror convert_messages() in this path: also drop tool_name, codex_reasoning_items, codex_message_items, and every `_`-prefixed key. Copy-on-write is already in place, so internal history keeps the fields for FTS / Codex-fallback. Adds a regression test to TestHandleMaxIterations asserting the summary request carries none of the schema-foreign keys (fails on main, passes here). --- agent/chat_completion_helpers.py | 12 +++++++++++ tests/run_agent/test_run_agent.py | 34 +++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 0785347d2c9..cc7427950b2 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -1283,6 +1283,18 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str: agent._copy_reasoning_content_for_api(msg, api_msg) for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"): api_msg.pop(internal_field, None) + # Strict OpenAI-compatible gateways (Fireworks-backed OpenCode Go, + # Mistral, Moonshot/Kimi) reject any message key outside the Chat + # Completions schema. The main loop drops these via + # ChatCompletionsTransport.convert_messages(), but the summary path + # hand-builds messages and calls chat.completions.create() directly, + # bypassing the transport — so mirror that sanitization here: + # tool_name (SQLite FTS bookkeeping), the codex_* reasoning carriers, + # and every Hermes-internal underscore-prefixed scaffolding key. + for schema_foreign in ("tool_name", "codex_reasoning_items", "codex_message_items"): + api_msg.pop(schema_foreign, None) + for internal_key in [k for k in api_msg if isinstance(k, str) and k.startswith("_")]: + api_msg.pop(internal_key, None) if _needs_sanitize: agent._sanitize_tool_calls_for_strict_api(api_msg) api_messages.append(api_msg) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index f5112824a7a..1653dc0d4ad 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -2756,6 +2756,40 @@ class TestHandleMaxIterations: ] assert len(stub_ids) >= 1, f"No stub result for assistant tool_call: {stub_ids}" + def test_summary_strips_strict_schema_foreign_fields(self, agent): + """Regression: the max-iterations summary request must NOT carry + Chat-Completions-schema-foreign keys — tool_name (SQLite FTS + bookkeeping), codex_* reasoning carriers, or internal _-prefixed + scaffolding. Strict gateways (Fireworks-backed OpenCode Go, Mistral, + Kimi) reject these with 'Extra inputs are not permitted, field: + messages[N].tool_name'. The transport's convert_messages() strips + them on the main loop; this hand-built summary path must mirror it.""" + agent.client.chat.completions.create.return_value = _mock_response(content="Summary") + agent._cached_system_prompt = "You are helpful." + messages = [ + {"role": "user", "content": "do stuff"}, + { + "role": "assistant", + "tool_calls": [{"id": "call_1", "function": {"name": "execute_code", "arguments": "{}"}}], + "codex_reasoning_items": [{"id": "rs_1"}], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "result", "tool_name": "execute_code"}, + {"role": "assistant", "content": "Done.", "_empty_recovery_synthetic": True}, + ] + + result = agent._handle_max_iterations(messages, 60) + + assert result == "Summary" + sent_msgs = agent.client.chat.completions.create.call_args.kwargs.get("messages", []) + for m in sent_msgs: + assert "tool_name" not in m, m + assert "codex_reasoning_items" not in m, m + assert "codex_message_items" not in m, m + assert not any(isinstance(k, str) and k.startswith("_") for k in m), m + # Internal history is untouched — the path copies each message. + assert messages[2]["tool_name"] == "execute_code" + assert messages[1]["codex_reasoning_items"] == [{"id": "rs_1"}] + def test_summary_omits_provider_preferences_for_non_openrouter(self, agent): agent.base_url = "https://api.openai.com/v1" agent._base_url_lower = agent.base_url.lower() From e8076c1ebe659c58284396d88f802537ffc2ccb8 Mon Sep 17 00:00:00 2001 From: SeaXen Date: Wed, 27 May 2026 14:21:22 +0000 Subject: [PATCH 23/89] fix(dashboard): allow chat websockets on insecure public bind Allow non-loopback websocket peers when the dashboard is explicitly exposed with --host 0.0.0.0/:: and --insecure. This fixes the failure mode where /chat rendered over LAN but /api/ws and /api/events were rejected with HTTP 403, leaving the embedded TUI chat disconnected. Add regression coverage for the insecure public bind case in the dashboard websocket auth tests. --- hermes_cli/web_server.py | 10 +++++- .../hermes_cli/test_dashboard_auth_ws_auth.py | 34 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 58e0d59908b..70a87e1969c 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -3371,10 +3371,15 @@ _LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost", "testclient"}) def _ws_client_is_allowed(ws: "WebSocket") -> bool: """Check if the WebSocket client IP is acceptable. - Loopback mode: only loopback clients allowed — the legacy + Loopback bind: only loopback clients allowed — the legacy ``?token=<_SESSION_TOKEN>`` path is the only auth we have, so we don't want LAN hosts guessing tokens. + All-interfaces insecure bind (``--host 0.0.0.0 --insecure`` or + ``--host :: --insecure``): allow any peer. The operator explicitly + opted into LAN/public exposure in this mode, so the loopback-only peer + restriction should not apply. + Gated mode: any peer is allowed — uvicorn's ``proxy_headers=True`` (enabled when the OAuth gate is active so cookies can pick up ``X-Forwarded-Proto``) rewrites ``ws.client.host`` to the @@ -3385,6 +3390,9 @@ def _ws_client_is_allowed(ws: "WebSocket") -> bool: """ if getattr(app.state, "auth_required", False): return True + bound_host = getattr(app.state, "bound_host", "") + if bound_host in {"0.0.0.0", "::"}: + return True client_host = ws.client.host if ws.client else "" if not client_host: return True diff --git a/tests/hermes_cli/test_dashboard_auth_ws_auth.py b/tests/hermes_cli/test_dashboard_auth_ws_auth.py index 0ebed6d9519..e07e5e3c4f6 100644 --- a/tests/hermes_cli/test_dashboard_auth_ws_auth.py +++ b/tests/hermes_cli/test_dashboard_auth_ws_auth.py @@ -80,6 +80,25 @@ def loopback_app(): web_server.app.state.auth_required = prev_required +@pytest.fixture +def insecure_public_app(): + """web_server.app configured for all-interfaces insecure mode.""" + _reset_for_tests() + clear_providers() + prev_host = getattr(web_server.app.state, "bound_host", None) + prev_port = getattr(web_server.app.state, "bound_port", None) + prev_required = getattr(web_server.app.state, "auth_required", None) + web_server.app.state.bound_host = "0.0.0.0" + web_server.app.state.bound_port = 9120 + web_server.app.state.auth_required = False + client = TestClient(web_server.app, base_url="http://192.168.0.222:9120") + yield client + _reset_for_tests() + web_server.app.state.bound_host = prev_host + web_server.app.state.bound_port = prev_port + web_server.app.state.auth_required = prev_required + + def _logged_in(client: TestClient) -> None: """Drive the stub OAuth round trip so the client holds session cookies.""" r1 = client.get("/auth/login?provider=stub", follow_redirects=False) @@ -281,6 +300,21 @@ class TestWsRequestIsAllowedGated: ws.headers = {"host": "127.0.0.1:8080"} assert web_server._ws_request_is_allowed(ws) is True + def test_non_loopback_peer_allowed_in_insecure_public_mode(self, insecure_public_app): + """`--host 0.0.0.0 --insecure` is an explicit LAN/public opt-in. + + Regression coverage for the dashboard `/chat` breakage where the + HTML shell loaded on 9120 but every WebSocket upgrade was rejected + with 403 because the loopback-only peer guard still ran even though + the operator intentionally exposed the dashboard on all interfaces. + """ + ws = _fake_ws(query={}, client_host="192.168.0.55") + ws.headers = { + "host": "192.168.0.222:9120", + "origin": "http://192.168.0.222:9120", + } + assert web_server._ws_request_is_allowed(ws) is True + def test_host_origin_guard_still_runs_in_gated_mode(self, gated_app): """Bypassing the peer-IP check must not bypass the DNS-rebinding Host header guard — that one still protects against attacker From 17103a1f118022a2836cedd89eb3d6a7af4f79ea Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 21:19:19 -0700 Subject: [PATCH 24/89] chore: add SeaXen to AUTHOR_MAP for salvaged PR #33278 --- scripts/release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.py b/scripts/release.py index 9b7e12a5d1f..9b05ea72190 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" # Auto-extracted from noreply emails + manual overrides AUTHOR_MAP = { + "drpelagik@gmail.com": "SeaXen", "metalclaudbot@gmail.com": "HashClawAI", "tonybear55665566@gmail.com": "TonyPepeBear", "kaspersniels@gmail.com": "nielskaspers", From 84ee80eb5d94838dd5b2c3c74a0fbe53dfb48c28 Mon Sep 17 00:00:00 2001 From: weichengxu Date: Fri, 29 May 2026 21:27:29 -0700 Subject: [PATCH 25/89] feat: set process title to 'hermes' in ps/top/htop Adds _set_process_title() in hermes_cli/main.py, called first thing in main(). Tries setproctitle (optional) for a full ps-args rewrite, then falls back to ctypes prctl(PR_SET_NAME) on Linux / pthread_setname_np on macOS. No-op on Windows and on any failure. No new dependency: the setproctitle path is best-effort via ImportError guard. Fixes #35108 --- hermes_cli/main.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 79dd50c23b0..93ca26e90e6 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -65,6 +65,46 @@ import os import sys +def _set_process_title() -> None: + """Set the process title to 'hermes' so tools like 'ps', 'top', and + 'htop' show the app name instead of 'python3.xx'. + + Purely cosmetic — non-fatal on any platform. + + Strategy (try in order): + 1. ``setproctitle`` (opt-in dep — installed via ``hermes tools`` or + ``pip install setproctitle``, or bundled in a future release). + 2. ctypes ``prctl(PR_SET_NAME)`` (Linux only, 15-char limit). + 3. ctypes ``pthread_setname_np`` (macOS only, kernel thread name — + changes lldb/top but not ``ps aux``). + 4. No-op on Windows (the .exe name is already ``hermes.exe``). + """ + # Strategy 1: setproctitle (best — works on macOS, Linux, BSD) + try: + import setproctitle # type: ignore[import-untyped] + + setproctitle.setproctitle("hermes") + return + except ImportError: + pass + + # Strategy 2/3: platform-specific ctypes fallback + import ctypes + import platform + + try: + system = platform.system() + if system == "Linux": + libc = ctypes.CDLL("libc.so.6", use_errno=True) + libc.prctl(15, b"hermes", 0, 0, 0) # PR_SET_NAME = 15 + elif system == "Darwin": + libc = ctypes.CDLL("libc.dylib", use_errno=True) + libc.pthread_setname_np(b"hermes") + # Windows: the .exe name is already ``hermes.exe`` — nothing to do. + except Exception: + pass + + # Mouse-tracking residue suppression — runs BEFORE every other import on the # TUI hot path so the terminal stops emitting SGR/X10 mouse reports while the # Python launcher is still doing imports (≈100–300ms in cooked + echo mode, @@ -11276,6 +11316,10 @@ def _try_termux_fast_tui_launch() -> bool: def main(): """Main entry point for hermes CLI.""" + # Cosmetic: make the process show up as 'hermes' instead of 'python3.11' + # in ps/top/htop. Non-fatal — just a nicer UX. + _set_process_title() + # Force UTF-8 stdio on Windows before anything prints. No-op elsewhere. try: from hermes_cli.stdio import configure_windows_stdio From e5765e61fa68b7fa6aebd01ef1a2a79c7af80f82 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 21:27:29 -0700 Subject: [PATCH 26/89] chore(release): map wei.chen.coder@gmail.com -> wenchengxucool --- scripts/release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.py b/scripts/release.py index 9b05ea72190..7cc6a94d0ee 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -68,6 +68,7 @@ AUTHOR_MAP = { "wangpuv@hotmail.com": "wangpuv", "202622897+ticketclosed-wontfix@users.noreply.github.com": "ticketclosed-wontfix", "wuxuebin1993@gmail.com": "victorGPT", + "wei.chen.coder@gmail.com": "wenchengxucool", "frowte3k@gmail.com": "Frowtek", "211828103+julio-cloudvisor@users.noreply.github.com": "julio-cloudvisor", "17778+kweiner@users.noreply.github.com": "kweiner", From bb79bcde6103c564dacb2d796fe8fe8b775f1b18 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 00:32:05 -0700 Subject: [PATCH 27/89] fix: detect pyproject.toml / __init__.py version drift in hermes doctor (#35142) A git conflict resolution (reset --hard or merge) can revert hermes_cli/__init__.py to a stale __version__ while pyproject.toml stays current, so 'hermes --version' silently reports the wrong version. Nothing cross-checked the two files. Add a version-consistency check to the doctor 'Python Environment' section: reads the [project] version from pyproject.toml and compares it to hermes_cli.__version__. Reports OK when they match, fails with a re-sync hint when they drift, and is a silent no-op for installed wheels where pyproject.toml isn't present. Closes #35070 --- hermes_cli/doctor.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py index 3db70beaa72..4971f1faece 100644 --- a/hermes_cli/doctor.py +++ b/hermes_cli/doctor.py @@ -204,6 +204,60 @@ def _fail_and_issue(text: str, detail: str, fix: str, issues: list[str]) -> None issues.append(fix) +def _read_pyproject_version() -> str | None: + """Read the ``version = "..."`` from ``pyproject.toml`` at the project root. + + Returns None when running from an installed wheel (no pyproject.toml ships + with the package) or when the file can't be parsed. Reads only the + ``[project]`` version, ignoring any version strings that appear in other + tables. + """ + pyproject = PROJECT_ROOT / "pyproject.toml" + try: + text = pyproject.read_text(encoding="utf-8") + except OSError: + return None + in_project = False + for raw in text.splitlines(): + line = raw.strip() + if line.startswith("[") and line.endswith("]"): + in_project = line == "[project]" + continue + if in_project and line.startswith("version") and "=" in line: + value = line.split("=", 1)[1] + value = value.split("#", 1)[0].strip().strip("\"'") + return value or None + return None + + +def _check_version_consistency(issues: list[str]) -> None: + """Verify pyproject.toml version matches hermes_cli.__version__. + + A git conflict resolution (reset/merge) can revert one file without the + other, leaving ``hermes --version`` reporting a stale version while + ``pyproject.toml`` is current. Detect that drift so users can re-sync. + Silent no-op for installed wheels where pyproject.toml isn't present. + """ + try: + from hermes_cli import __version__ as init_version + except Exception: + return + pyproject_version = _read_pyproject_version() + if pyproject_version is None: + # Installed wheel or unreadable pyproject — nothing to cross-check. + return + if pyproject_version == init_version: + check_ok("Version files consistent", f"({init_version})") + else: + _fail_and_issue( + "Version mismatch between source files", + f"(pyproject.toml {pyproject_version} != hermes_cli/__init__.py {init_version})", + "Re-sync version files (e.g. run 'hermes update', or set " + "hermes_cli/__init__.py __version__ to match pyproject.toml)", + issues, + ) + + def _check_s6_supervision(issues: list[str]) -> None: """Inside a container under our s6 /init, surface what s6 sees. @@ -509,6 +563,10 @@ def run_doctor(args): check_ok("Virtual environment active") else: check_warn("Not in virtual environment", "(recommended)") + + # Detect drift between pyproject.toml and hermes_cli/__init__.py versions + # (a git conflict resolution can silently revert one but not the other). + _check_version_consistency(issues) _section("Required Packages") required_packages = [ From 9d2571c86a7dae2bb526ca22233fe5309c23d53d Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Sat, 30 May 2026 12:54:41 +0530 Subject: [PATCH 28/89] fix: surface /agents nudge while delegate_task is in-flight (TUI + CLI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The subagent spawn-observability overlay added a `(/agents)` hint, but only on the standalone "Spawn tree" panel, gated behind `!inlineDelegateKey` — it never showed for a single delegate_task call, and only appeared once subagents had already registered. A nudge that arrives at the end (or only after spawn) is useless for the actual goal: letting users open the live monitor *while* delegation is running. Surface it the moment delegation starts, on both surfaces: TUI (ui-tui/src/components/thinking.tsx) - Show `(/agents)` on any "Delegate Task" tool group as soon as it appears (in-flight, before any subagent registers), not gated on subagents already existing. Same `startsWith('Delegate Task')` predicate already used for delegateGroups. CLI (agent/tool_executor.py) - Append `· /agents to monitor` to the delegate spinner label, which is displayed for the full duration of the delegate_task call. The previous attempt put the hint on the completion line (get_cute_tool_message), which only renders after the call finishes — reverted. TUI tsc clean (pre-existing execFileNoThrow type errors unrelated); subagentTree 35/35; display.py reverted to upstream. --- agent/tool_executor.py | 8 ++++++-- ui-tui/src/components/thinking.tsx | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/agent/tool_executor.py b/agent/tool_executor.py index 358c1a0a8f7..b249de3de04 100644 --- a/agent/tool_executor.py +++ b/agent/tool_executor.py @@ -753,10 +753,14 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe elif function_name == "delegate_task": tasks_arg = function_args.get("tasks") if tasks_arg and isinstance(tasks_arg, list): - spinner_label = f"🔀 delegating {len(tasks_arg)} tasks" + spinner_label = f"🔀 delegating {len(tasks_arg)} tasks · (/agents to monitor)" else: goal_preview = (function_args.get("goal") or "")[:30] - spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating" + spinner_label = ( + f"🔀 {goal_preview} · (/agents to monitor)" + if goal_preview + else "🔀 delegating · (/agents to monitor)" + ) spinner = None if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner(): face = random.choice(KawaiiSpinner.get_waiting_faces()) diff --git a/ui-tui/src/components/thinking.tsx b/ui-tui/src/components/thinking.tsx index 0d9ecee87c9..ce90cca2138 100644 --- a/ui-tui/src/components/thinking.tsx +++ b/ui-tui/src/components/thinking.tsx @@ -1073,6 +1073,10 @@ export const ToolTrail = memo(function ToolTrail({ const branch: TreeBranch = index === groups.length - 1 ? 'last' : 'mid' const childRails = nextTreeRails(rails, branch) const hasInlineSubagents = inlineDelegateKey === group.key + // Surface the /agents hint the moment a delegate group appears — + // while it's still in-flight and before any subagent has + // registered — so users can open the live monitor immediately. + const isDelegateGroup = group.label.startsWith('Delegate Task') return ( @@ -1083,6 +1087,11 @@ export const ToolTrail = memo(function ToolTrail({ <> {toolLabel(group)} + {isDelegateGroup ? ( + + {' (/agents to monitor)'} + + ) : null} } rails={rails} From b4cf114f68da5d1de6b53cdc4a208d270e0654d7 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 01:40:39 -0700 Subject: [PATCH 29/89] fix(vision): fail fast on non-retryable image download errors (#35221) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _download_image() wrapped every download attempt in a blanket `except Exception` and retried 3x with 2s/4s/8s backoff regardless of cause. A 404/403 image URL would never resolve on retry, so it just burned up to 6s of wall-clock + extra GETs before failing — inflating latency for a deterministic failure (issue #32296, umbrella #35114). Add _is_retryable_download_error(): 4xx client errors (except 429), website-policy PermissionError, and too-large/SSRF ValueError now raise on the first attempt. 429, 5xx, and unclassified network errors stay retryable. Removed the now-unreachable fall-through branch since the loop always returns on success or re-raises on the final/terminal attempt. --- tests/tools/test_vision_tools.py | 81 ++++++++++++++++++++++++++++++++ tools/vision_tools.py | 60 +++++++++++++++++------ 2 files changed, 127 insertions(+), 14 deletions(-) diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index e3bff50d56f..7a50a4b4630 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -917,3 +917,84 @@ class TestIsImageSizeError: def test_empty_message(self): assert not _is_image_size_error(Exception("")) + + +class TestDownloadRetryClassification: + """Error-class-aware retry: 4xx fail-fast, 429/5xx/transient retried (issue #32296).""" + + @staticmethod + def _status_error(status_code): + import httpx + + request = httpx.Request("GET", "https://example.com/img.jpg") + response = httpx.Response(status_code, request=request) + return httpx.HTTPStatusError( + f"{status_code}", request=request, response=response + ) + + def _make_client_raising_status(self, status_code): + """AsyncClient whose response.raise_for_status() raises HTTPStatusError.""" + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock( + side_effect=self._status_error(status_code) + ) + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.get = AsyncMock(return_value=mock_response) + return mock_client + + def test_is_retryable_classification(self): + from tools.vision_tools import _is_retryable_download_error + + # Non-retryable client errors + for code in (400, 403, 404, 410): + assert _is_retryable_download_error(self._status_error(code)) is False + # Retryable: rate limit + server errors + for code in (429, 500, 502, 503): + assert _is_retryable_download_error(self._status_error(code)) is True + # Policy/SSRF/size errors are terminal + assert _is_retryable_download_error(PermissionError("blocked")) is False + assert _is_retryable_download_error(ValueError("too large")) is False + # Unclassified (network blip) is retryable + assert _is_retryable_download_error(ConnectionError("reset")) is True + + @pytest.mark.asyncio + async def test_404_fails_fast_without_retry(self, tmp_path): + """A 404 must raise on the first attempt — no backoff sleep, no extra GETs.""" + import httpx + from tools.vision_tools import _download_image + + mock_client = self._make_client_raising_status(404) + with ( + patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client), + patch("tools.vision_tools.check_website_access", return_value=None), + patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + pytest.raises(httpx.HTTPStatusError), + ): + await _download_image( + "https://example.com/missing.jpg", tmp_path / "x.jpg", max_retries=3 + ) + # Exactly one attempt, zero backoff sleeps. + assert mock_client.get.await_count == 1 + mock_sleep.assert_not_called() + + @pytest.mark.asyncio + async def test_503_retries_then_raises(self, tmp_path): + """A 5xx is retried up to max_retries, sleeping between attempts.""" + import httpx + from tools.vision_tools import _download_image + + mock_client = self._make_client_raising_status(503) + with ( + patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client), + patch("tools.vision_tools.check_website_access", return_value=None), + patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + pytest.raises(httpx.HTTPStatusError), + ): + await _download_image( + "https://example.com/flaky.jpg", tmp_path / "y.jpg", max_retries=3 + ) + # All three attempts used, two backoff sleeps between them. + assert mock_client.get.await_count == 3 + assert mock_sleep.await_count == 2 diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 986f9dab984..23a0508fed1 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -127,6 +127,30 @@ def _detect_image_mime_type(image_path: Path) -> Optional[str]: return None +def _is_retryable_download_error(error: Exception) -> bool: + """Return True only for transient image-download failures worth retrying. + + Non-retryable (fail-fast): + - httpx.HTTPStatusError with a 4xx status other than 429 (404/403/410/...): + the resource is missing or forbidden; retrying can't change that. + - PermissionError: blocked by website policy / SSRF guard. + - ValueError: image too large or blocked redirect — deterministic. + + Retryable (transient): + - httpx 429 (rate limited) and 5xx (server-side) errors. + - Connection/timeout/transport errors (httpx.TransportError) and any + other unclassified exception, which may be a flaky network blip. + """ + if isinstance(error, (PermissionError, ValueError)): + return False + if isinstance(error, httpx.HTTPStatusError): + status = error.response.status_code + if 400 <= status < 500 and status != 429: + return False + return True + return True + + async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path: """ Download an image from a URL to a local destination (async) with retry logic. @@ -210,24 +234,32 @@ async def _download_image(image_url: str, destination: Path, max_retries: int = return destination except Exception as e: last_error = e - if attempt < max_retries - 1: - wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s - logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50]) - logger.warning("Retrying in %ss...", wait_time) - await asyncio.sleep(wait_time) - else: + # Error-class-aware retry: only retry transient failures. A 4xx + # client error (404/403/410, etc.) will never succeed on retry — + # the resource isn't there or we're not allowed — so burning 3 + # attempts with 2s/4s/8s backoff just inflates latency. 429 (rate + # limit) and 5xx remain retryable. PermissionError (policy block) + # and ValueError (too-large / SSRF redirect) are also terminal. + if not _is_retryable_download_error(e) or attempt >= max_retries - 1: logger.error( - "Image download failed after %s attempts: %s", - max_retries, + "Image download failed after %s attempt(s): %s", + attempt + 1, str(e)[:100], exc_info=True, ) - - if last_error is None: - raise RuntimeError( - f"_download_image exited retry loop without attempting (max_retries={max_retries})" - ) - raise last_error + raise + wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s + logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50]) + logger.warning("Retrying in %ss...", wait_time) + await asyncio.sleep(wait_time) + + # The loop always returns on success or re-raises on the final/non-retryable + # attempt, so reaching here means max_retries was non-positive. + if last_error is not None: + raise last_error + raise RuntimeError( + f"_download_image exited retry loop without attempting (max_retries={max_retries})" + ) def _determine_mime_type(image_path: Path) -> str: From 64998fa93e2bd52ee191701ea50c0febcc8e3dc6 Mon Sep 17 00:00:00 2001 From: annguyenNous Date: Sat, 30 May 2026 10:45:57 +0700 Subject: [PATCH 30/89] fix(tui): use base64 encoding for PowerShell clipboard writes to preserve UTF-8 When writing text to the clipboard via PowerShell (WSL2 and native Windows), the previous implementation piped text through stdin using `Set-Clipboard -Value $input`. PowerShell reads stdin using the Windows system's default ANSI code page (e.g. CP936 for Chinese Windows), causing all non-ASCII characters (CJK, emoji, accented) to become garbled. Fix: encode the text as base64 in Node.js and pass it as a command argument. PowerShell decodes it from base64 using explicit UTF-8, bypassing the code page issue entirely. Fixes #35107 --- ui-tui/src/__tests__/clipboard.test.ts | 44 ++++++++++++++++++++++-- ui-tui/src/lib/clipboard.ts | 46 ++++++++++++++++++-------- 2 files changed, 74 insertions(+), 16 deletions(-) diff --git a/ui-tui/src/__tests__/clipboard.test.ts b/ui-tui/src/__tests__/clipboard.test.ts index b0646ee488e..93feb009d87 100644 --- a/ui-tui/src/__tests__/clipboard.test.ts +++ b/ui-tui/src/__tests__/clipboard.test.ts @@ -269,7 +269,14 @@ describe('writeClipboardText', () => { expect.arrayContaining(['-NoProfile', '-NonInteractive']), expect.anything() ) - expect(stdin.end).toHaveBeenCalledWith('wsl text') + // PowerShell uses base64-encoded UTF-8 via command argument, not stdin + expect(stdin.end).not.toHaveBeenCalled() + const calledArgs = start.mock.calls[0][1] as string[] + const commandIdx = calledArgs.indexOf('-Command') + expect(commandIdx).toBeGreaterThan(-1) + const script = calledArgs[commandIdx + 1] + expect(script).toContain('FromBase64String') + expect(script).toContain(Buffer.from('wsl text', 'utf8').toString('base64')) }) it('prefers the Windows clipboard path over wl-copy inside WSLg', async () => { @@ -300,7 +307,13 @@ describe('writeClipboardText', () => { expect.arrayContaining(['-NoProfile', '-NonInteractive']), expect.anything() ) - expect(stdin.end).toHaveBeenCalledWith('wslg text') + // PowerShell uses base64-encoded UTF-8 via command argument, not stdin + expect(stdin.end).not.toHaveBeenCalled() + const calledArgs = start.mock.calls[0][1] as string[] + const commandIdx = calledArgs.indexOf('-Command') + const script = calledArgs[commandIdx + 1] + expect(script).toContain('FromBase64String') + expect(script).toContain(Buffer.from('wslg text', 'utf8').toString('base64')) }) it('uses PowerShell on Windows', async () => { @@ -325,5 +338,32 @@ describe('writeClipboardText', () => { expect.arrayContaining(['-NoProfile', '-NonInteractive']), expect.anything() ) + // PowerShell uses base64-encoded UTF-8 via command argument, not stdin + expect(stdin.end).not.toHaveBeenCalled() + }) + + it('preserves CJK text via base64 encoding in PowerShell on WSL', async () => { + const stdin = { end: vi.fn() } + + const child = { + once: vi.fn((event: string, cb: (code?: number) => void) => { + if (event === 'close') { + cb(0) + } + + return child + }), + stdin + } + + const start = vi.fn().mockReturnValue(child) + const cjkText = '你好世界,测试中文 🎉' + + await expect(writeClipboardText(cjkText, 'linux', start as any, { WSL_INTEROP: '/tmp/socket' })).resolves.toBe(true) + const calledArgs = start.mock.calls[0][1] as string[] + const commandIdx = calledArgs.indexOf('-Command') + const script = calledArgs[commandIdx + 1] + expect(script).toContain(Buffer.from(cjkText, 'utf8').toString('base64')) + expect(script).toContain('UTF8.GetString') }) }) diff --git a/ui-tui/src/lib/clipboard.ts b/ui-tui/src/lib/clipboard.ts index 587e8986c3e..93472de7d5d 100644 --- a/ui-tui/src/lib/clipboard.ts +++ b/ui-tui/src/lib/clipboard.ts @@ -91,33 +91,44 @@ export async function readClipboardText( return null } +type WriteCmd = { args: readonly string[]; cmd: string } & ( + | { stdin: true } + | { stdin: false; psScript: (b64: string) => string } +) + +function _powershellWriteScript(b64: string): string { + return `Set-Clipboard -Value ([System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String('${b64}')))` +} + function writeClipboardCommands( platform: NodeJS.Platform, env: NodeJS.ProcessEnv -): Array<{ args: readonly string[]; cmd: string }> { +): WriteCmd[] { if (platform === 'darwin') { - return [{ cmd: 'pbcopy', args: [] }] + return [{ cmd: 'pbcopy', args: [], stdin: true }] } if (platform === 'win32') { - return [{ cmd: 'powershell', args: ['-NoProfile', '-NonInteractive', '-Command', 'Set-Clipboard -Value $input'] }] + return [{ cmd: 'powershell', args: ['-NoProfile', '-NonInteractive'], stdin: false, psScript: _powershellWriteScript }] } - const attempts: Array<{ args: readonly string[]; cmd: string }> = [] + const attempts: WriteCmd[] = [] if (env.WSL_INTEROP || env.WSL_DISTRO_NAME) { attempts.push({ cmd: 'powershell.exe', - args: ['-NoProfile', '-NonInteractive', '-Command', 'Set-Clipboard -Value $input'] + args: ['-NoProfile', '-NonInteractive'], + stdin: false, + psScript: _powershellWriteScript }) } if (env.WAYLAND_DISPLAY) { - attempts.push({ cmd: 'wl-copy', args: ['--type', 'text/plain'] }) + attempts.push({ cmd: 'wl-copy', args: ['--type', 'text/plain'], stdin: true }) } - attempts.push({ cmd: 'xclip', args: ['-selection', 'clipboard', '-in'] }) - attempts.push({ cmd: 'xsel', args: ['--clipboard', '--input'] }) + attempts.push({ cmd: 'xclip', args: ['-selection', 'clipboard', '-in'], stdin: true }) + attempts.push({ cmd: 'xsel', args: ['--clipboard', '--input'], stdin: true }) return attempts } @@ -144,14 +155,21 @@ export async function writeClipboardText( ): Promise { const candidates = writeClipboardCommands(platform, env) - for (const { cmd, args } of candidates) { + for (const cmdEntry of candidates) { try { const ok = await new Promise(resolve => { - const child = start(cmd, [...args], { stdio: ['pipe', 'ignore', 'ignore'], windowsHide: true }) - - child.once('error', () => resolve(false)) - child.once('close', code => resolve(code === 0)) - child.stdin?.end(text) + if (cmdEntry.stdin) { + const child = start(cmdEntry.cmd, [...cmdEntry.args], { stdio: ['pipe', 'ignore', 'ignore'], windowsHide: true }) + child.once('error', () => resolve(false)) + child.once('close', (code: number | null) => resolve(code === 0)) + child.stdin?.end(text) + } else { + const b64 = Buffer.from(text, 'utf8').toString('base64') + const script = cmdEntry.psScript(b64) + const child = start(cmdEntry.cmd, [...cmdEntry.args, '-Command', script], { stdio: ['ignore', 'ignore', 'ignore'], windowsHide: true }) + child.once('error', () => resolve(false)) + child.once('close', (code: number | null) => resolve(code === 0)) + } }) if (ok) { From 16882cfded90b8c41ff18000c56a84d7f17628b7 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 00:33:14 -0700 Subject: [PATCH 31/89] refactor(tui): simplify base64 clipboard write to a stdin flag The per-entry psScript callback was identical for every PowerShell entry, so the function-valued union member added structure without behavior. Collapse WriteCmd to a plain stdin boolean and apply the one shared base64 script in the write loop. Document the CP936 root cause inline. Co-authored-by: BROCCOLO1D <279959838+BROCCOLO1D@users.noreply.github.com> --- ui-tui/src/lib/clipboard.ts | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ui-tui/src/lib/clipboard.ts b/ui-tui/src/lib/clipboard.ts index 93472de7d5d..4a5387ae2d2 100644 --- a/ui-tui/src/lib/clipboard.ts +++ b/ui-tui/src/lib/clipboard.ts @@ -91,10 +91,13 @@ export async function readClipboardText( return null } -type WriteCmd = { args: readonly string[]; cmd: string } & ( - | { stdin: true } - | { stdin: false; psScript: (b64: string) => string } -) +// PowerShell on Windows/WSL decodes piped stdin with the system ANSI code +// page (e.g. CP936), not UTF-8, so $input-based writes mangle CJK/emoji. We +// instead base64-encode the UTF-8 bytes and pass them as a -Command argument, +// decoding with UTF8.GetString — this removes the stdin-encoding variable +// entirely (also immune to BOM injection on redirect). PowerShell entries set +// stdin=false; every other backend reads UTF-8 stdin natively. +type WriteCmd = { args: readonly string[]; cmd: string; stdin: boolean } function _powershellWriteScript(b64: string): string { return `Set-Clipboard -Value ([System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String('${b64}')))` @@ -109,18 +112,13 @@ function writeClipboardCommands( } if (platform === 'win32') { - return [{ cmd: 'powershell', args: ['-NoProfile', '-NonInteractive'], stdin: false, psScript: _powershellWriteScript }] + return [{ cmd: 'powershell', args: ['-NoProfile', '-NonInteractive'], stdin: false }] } const attempts: WriteCmd[] = [] if (env.WSL_INTEROP || env.WSL_DISTRO_NAME) { - attempts.push({ - cmd: 'powershell.exe', - args: ['-NoProfile', '-NonInteractive'], - stdin: false, - psScript: _powershellWriteScript - }) + attempts.push({ cmd: 'powershell.exe', args: ['-NoProfile', '-NonInteractive'], stdin: false }) } if (env.WAYLAND_DISPLAY) { @@ -165,7 +163,7 @@ export async function writeClipboardText( child.stdin?.end(text) } else { const b64 = Buffer.from(text, 'utf8').toString('base64') - const script = cmdEntry.psScript(b64) + const script = _powershellWriteScript(b64) const child = start(cmdEntry.cmd, [...cmdEntry.args, '-Command', script], { stdio: ['ignore', 'ignore', 'ignore'], windowsHide: true }) child.once('error', () => resolve(false)) child.once('close', (code: number | null) => resolve(code === 0)) From c70dca3a8856a6e6b5cc40f07deeac4703f22a5f Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 00:31:52 -0700 Subject: [PATCH 32/89] fix(kanban): rebuild legacy TEXT-PK tables to INTEGER AUTOINCREMENT on open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy kanban boards (pre-AUTOINCREMENT schema) crashed the gateway notifier on every tick — int(None) on a NULL id in unseen_events_for_sub — silently losing all kanban notifications. CREATE TABLE IF NOT EXISTS skips existing tables regardless of schema and _add_column_if_missing only adds columns, so neither could fix a drifted primary-key type. _rebuild_drifted_tables() detects the legacy shape via PRAGMA table_info and rebuilds task_events/task_comments/task_runs (TEXT PK -> INTEGER AUTOINCREMENT) and kanban_notify_subs.last_event_id (TEXT/NULL -> INTEGER NOT NULL DEFAULT 0), preserving data. The whole pass is one transaction so an interruption can't leave a table half-renamed, and recreates every index DROP TABLE would otherwise take down (including idx_events_run). Co-authored-by: liuhao1024 --- hermes_cli/kanban_db.py | 134 +++++++++++++++++++++++ tests/hermes_cli/test_kanban_db_init.py | 139 ++++++++++++++++++++++++ 2 files changed, 273 insertions(+) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 5e465e87a6f..c0e8372c727 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -1637,6 +1637,140 @@ def _migrate_add_optional_columns(conn: sqlite3.Connection) -> None: (new, old), ) + _rebuild_drifted_tables(conn) + + +# Legacy DBs defined these tables with a ``TEXT PRIMARY KEY`` id (or, for +# ``kanban_notify_subs``, a nullable ``TEXT last_event_id``). The current +# schema uses ``INTEGER PRIMARY KEY AUTOINCREMENT`` / ``INTEGER NOT NULL +# DEFAULT 0``. ``CREATE TABLE IF NOT EXISTS`` skips existing tables +# regardless of schema and ``_add_column_if_missing`` only adds columns, so +# neither can fix a drifted column type — the table must be rebuilt. See +# #35096. +# +# Each entry pairs the canonical CREATE TABLE with the CREATE INDEX +# statements that DROP TABLE would otherwise take down with it (including +# ``idx_events_run``, added by the additive pass above). To guard against +# this list drifting from SCHEMA_SQL, ``test_rebuilt_schema_matches_fresh`` +# asserts a rebuilt legacy DB is byte-identical to a fresh one. +_REBUILD_SPECS = { + "task_events": ( + "CREATE TABLE task_events (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " task_id TEXT NOT NULL, run_id INTEGER, kind TEXT NOT NULL," + " payload TEXT, created_at INTEGER NOT NULL)", + ( + "CREATE INDEX idx_events_task ON task_events(task_id, created_at)", + "CREATE INDEX idx_events_run ON task_events(run_id, id)", + ), + ), + "task_comments": ( + "CREATE TABLE task_comments (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " task_id TEXT NOT NULL, author TEXT NOT NULL, body TEXT NOT NULL," + " created_at INTEGER NOT NULL)", + ("CREATE INDEX idx_comments_task ON task_comments(task_id, created_at)",), + ), + "task_runs": ( + "CREATE TABLE task_runs (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " task_id TEXT NOT NULL, profile TEXT, step_key TEXT," + " status TEXT NOT NULL, claim_lock TEXT, claim_expires INTEGER," + " worker_pid INTEGER, max_runtime_seconds INTEGER," + " last_heartbeat_at INTEGER, started_at INTEGER NOT NULL," + " ended_at INTEGER, outcome TEXT, summary TEXT, metadata TEXT," + " error TEXT)", + ( + "CREATE INDEX idx_runs_task ON task_runs(task_id, started_at)", + "CREATE INDEX idx_runs_status ON task_runs(status)", + ), + ), + "kanban_notify_subs": ( + "CREATE TABLE kanban_notify_subs (" + " task_id TEXT NOT NULL, platform TEXT NOT NULL, chat_id TEXT NOT NULL," + " thread_id TEXT NOT NULL DEFAULT '', user_id TEXT," + " notifier_profile TEXT, created_at INTEGER NOT NULL," + " last_event_id INTEGER NOT NULL DEFAULT 0," + " PRIMARY KEY (task_id, platform, chat_id, thread_id))", + ("CREATE INDEX idx_notify_task ON kanban_notify_subs(task_id)",), + ), +} + + +def _table_has_drifted(conn: sqlite3.Connection, table: str) -> bool: + """True when ``table`` still carries the legacy (pre-AUTOINCREMENT) shape.""" + info = conn.execute(f"PRAGMA table_info({table})").fetchall() + if not info: + return False # table absent — nothing to rebuild + if table == "kanban_notify_subs": + lei = next((c for c in info if c["name"] == "last_event_id"), None) + return lei is not None and (lei["type"] or "").upper() != "INTEGER" + # task_events / task_comments / task_runs: id must be INTEGER and a PK. + id_col = next((c for c in info if c["name"] == "id"), None) + if id_col is None: + return False + return not ((id_col["type"] or "").upper() == "INTEGER" and id_col["pk"]) + + +def _rebuild_drifted_tables(conn: sqlite3.Connection) -> None: + """Rebuild any kanban table whose column types drifted from SCHEMA_SQL. + + Old boards crash the gateway notifier (``int(None)`` on a NULL id in + ``unseen_events_for_sub``) and never match the ``id > cursor`` filter, so + every kanban notification is silently lost (#35096). Each affected table is + rebuilt with the standard SQLite pattern — CREATE new → INSERT shared + columns → DROP old → RENAME — recreating its indexes too (DROP TABLE takes + them down). The legacy TEXT ids are dropped (they aren't valid integers); + AUTOINCREMENT assigns fresh ones and ``last_event_id`` cursors reset to 0, + so the first post-migration tick replays a task's event history once — + the safe failure mode for a feature that was already fully broken. + + The whole pass runs in one transaction so an interruption can't leave a + table half-renamed, and under ``connect()``'s init locks so nothing races + it. Idempotent: a correctly-typed DB skips every table and returns without + opening a transaction. + """ + drifted = [t for t in _REBUILD_SPECS if _table_has_drifted(conn, t)] + if not drifted: + return + + conn.execute("BEGIN IMMEDIATE") + try: + for table in drifted: + create_sql, index_sqls = _REBUILD_SPECS[table] + old_cols = [c["name"] for c in conn.execute(f"PRAGMA table_info({table})")] + _log.info("kanban migration: rebuilding %s to match current schema", table) + conn.execute(f"ALTER TABLE {table} RENAME TO {table}_legacy") + conn.execute(create_sql) + new_cols = {c["name"] for c in conn.execute(f"PRAGMA table_info({table})")} + if table == "kanban_notify_subs": + # Cast the legacy TEXT cursor to INTEGER; NULL / non-numeric → 0. + shared = [c for c in old_cols if c in new_cols and c != "last_event_id"] + cols_csv = ", ".join(shared) + conn.execute( + f"INSERT INTO {table} ({cols_csv}, last_event_id) " + f"SELECT {cols_csv}, COALESCE(CAST(last_event_id AS INTEGER), 0) " + f"FROM {table}_legacy" + ) + else: + # Drop the legacy TEXT id; AUTOINCREMENT reassigns it. + shared = [c for c in old_cols if c in new_cols and c != "id"] + cols_csv = ", ".join(shared) + conn.execute( + f"INSERT INTO {table} ({cols_csv}) " + f"SELECT {cols_csv} FROM {table}_legacy" + ) + conn.execute(f"DROP TABLE {table}_legacy") + for index_sql in index_sqls: + conn.execute(index_sql) + conn.execute("COMMIT") + except Exception: + try: + conn.execute("ROLLBACK") + except sqlite3.OperationalError: + pass + raise + def _check_file_length_invariant(conn: sqlite3.Connection) -> None: """Read the SQLite header page_count and compare against actual file size. diff --git a/tests/hermes_cli/test_kanban_db_init.py b/tests/hermes_cli/test_kanban_db_init.py index c400b1d90f9..7db5d2009e6 100644 --- a/tests/hermes_cli/test_kanban_db_init.py +++ b/tests/hermes_cli/test_kanban_db_init.py @@ -1,11 +1,74 @@ from __future__ import annotations +import sqlite3 import threading from pathlib import Path from hermes_cli import kanban_db as kb +def _make_legacy_db(path: Path) -> None: + """Write a kanban DB with the pre-AUTOINCREMENT (TEXT PK) schema for the + four tables #35096 affects, keeping every other table current so the + additive-column migration runs cleanly on top. + """ + conn = sqlite3.connect(str(path)) + conn.executescript(kb.SCHEMA_SQL) + conn.executescript( + """ + DROP TABLE task_events; + DROP TABLE task_comments; + DROP TABLE task_runs; + DROP TABLE kanban_notify_subs; + CREATE TABLE task_comments (id TEXT PRIMARY KEY, task_id TEXT NOT NULL, + author TEXT NOT NULL, body TEXT NOT NULL, created_at INTEGER NOT NULL); + CREATE TABLE task_events (id TEXT PRIMARY KEY, task_id TEXT NOT NULL, + kind TEXT NOT NULL, payload TEXT, created_at INTEGER NOT NULL); + CREATE TABLE task_runs (id TEXT PRIMARY KEY, task_id TEXT NOT NULL, + profile TEXT, status TEXT NOT NULL, started_at INTEGER NOT NULL); + CREATE TABLE kanban_notify_subs (task_id TEXT NOT NULL, platform TEXT NOT NULL, + chat_id TEXT NOT NULL, thread_id TEXT NOT NULL DEFAULT '', user_id TEXT, + created_at INTEGER NOT NULL, last_event_id TEXT, + PRIMARY KEY (task_id, platform, chat_id, thread_id)); + """ + ) + conn.execute("INSERT INTO tasks (id, title, status, created_at) VALUES ('task-1', 'T', 'done', 1000)") + conn.execute("INSERT INTO task_comments VALUES ('c-1', 'task-1', 'agent', 'hi', 1500)") + conn.execute("INSERT INTO task_events VALUES ('e-1', 'task-1', 'completed', NULL, 2000)") + conn.execute("INSERT INTO task_events VALUES ('e-2', 'task-1', 'blocked', NULL, 2100)") + conn.execute("INSERT INTO task_runs VALUES ('r-1', 'task-1', 'default', 'done', 1000)") + conn.execute( + "INSERT INTO kanban_notify_subs (task_id, platform, chat_id, created_at, last_event_id) " + "VALUES ('task-1', 'telegram', '123', 1000, 'e-1')" + ) + conn.commit() + conn.close() + + +def _setup_home(tmp_path, monkeypatch) -> Path: + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + monkeypatch.setattr(Path, "home", lambda: tmp_path) + db_path = kb.kanban_db_path(board="legacy") + db_path.parent.mkdir(parents=True, exist_ok=True) + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + return db_path + + +def _table_struct(conn: sqlite3.Connection, table: str): + cols = [ + (r["name"], (r["type"] or "").upper(), r["notnull"], r["pk"]) + for r in conn.execute(f"PRAGMA table_info({table})") + ] + idx = sorted( + r["name"] + for r in conn.execute(f"PRAGMA index_list({table})") + if not r["name"].startswith("sqlite_") + ) + return cols, idx + + def test_connect_initialization_is_thread_safe(tmp_path, monkeypatch): home = tmp_path / ".hermes" home.mkdir() @@ -36,3 +99,79 @@ def test_connect_initialization_is_thread_safe(tmp_path, monkeypatch): with kb.connect(board="default") as conn: cols = {row["name"] for row in conn.execute("PRAGMA table_info(tasks)")} assert "max_retries" in cols + + +def test_legacy_text_pk_tables_rebuilt_to_integer_autoincrement(tmp_path, monkeypatch): + """A pre-AUTOINCREMENT DB is migrated in place: id columns become INTEGER + PKs, ``last_event_id`` becomes INTEGER, data is preserved, and indexes + are recreated (DROP TABLE would otherwise take them down).""" + db_path = _setup_home(tmp_path, monkeypatch) + _make_legacy_db(db_path) + + with kb.connect(db_path) as conn: + for table in ("task_events", "task_comments", "task_runs"): + id_col = {r["name"]: r for r in conn.execute(f"PRAGMA table_info({table})")}["id"] + assert id_col["type"].upper() == "INTEGER" and id_col["pk"] == 1 + + lei = {r["name"]: r for r in conn.execute("PRAGMA table_info(kanban_notify_subs)")} + assert lei["last_event_id"]["type"].upper() == "INTEGER" + + # Data preserved across the rebuild. + assert len(conn.execute("SELECT * FROM task_events").fetchall()) == 2 + assert conn.execute("SELECT body FROM task_comments").fetchone()["body"] == "hi" + assert len(conn.execute("SELECT * FROM task_runs").fetchall()) == 1 + # Non-numeric legacy cursor ("e-1") casts to 0. + assert conn.execute("SELECT last_event_id FROM kanban_notify_subs").fetchone()["last_event_id"] == 0 + + # Indexes restored, including idx_events_run (added by the additive pass). + indexes = {r[0] for r in conn.execute("SELECT name FROM sqlite_master WHERE type='index'")} + for name in ("idx_events_task", "idx_events_run", "idx_comments_task", + "idx_runs_task", "idx_runs_status", "idx_notify_task"): + assert name in indexes + + # AUTOINCREMENT actually works after the rebuild. + conn.execute("INSERT INTO task_events (task_id, kind, created_at) VALUES ('task-1', 'completed', 3000)") + new_id = conn.execute("SELECT id FROM task_events ORDER BY id DESC LIMIT 1").fetchone()["id"] + assert isinstance(new_id, int) and new_id >= 1 + + +def test_rebuilt_schema_matches_fresh_db(tmp_path, monkeypatch): + """The rebuilt tables must be structurally identical to a fresh DB, so the + hand-written DDL in ``_REBUILD_SPECS`` can't silently drift from SCHEMA_SQL.""" + legacy_path = _setup_home(tmp_path, monkeypatch) + _make_legacy_db(legacy_path) + fresh_path = kb.kanban_db_path(board="fresh") + fresh_path.parent.mkdir(parents=True, exist_ok=True) + kb._INITIALIZED_PATHS.discard(str(fresh_path.resolve())) + + with kb.connect(legacy_path) as migrated, kb.connect(fresh_path) as fresh: + for table in ("task_events", "task_comments", "task_runs", "kanban_notify_subs"): + assert _table_struct(migrated, table) == _table_struct(fresh, table) + + +def test_migration_is_idempotent(tmp_path, monkeypatch): + """Re-opening an already-migrated DB is a no-op and leaves data intact.""" + db_path = _setup_home(tmp_path, monkeypatch) + _make_legacy_db(db_path) + + with kb.connect(db_path): + pass + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + with kb.connect(db_path) as conn: + id_col = {r["name"]: r for r in conn.execute("PRAGMA table_info(task_events)")}["id"] + assert id_col["type"].upper() == "INTEGER" + assert len(conn.execute("SELECT * FROM task_events").fetchall()) == 2 + + +def test_unseen_events_for_sub_survives_migrated_db(tmp_path, monkeypatch): + """The crash that motivated #35096 — ``int(None)`` on a NULL cursor — is + gone after migration; the notifier query returns an integer cursor.""" + db_path = _setup_home(tmp_path, monkeypatch) + _make_legacy_db(db_path) + + with kb.connect(db_path) as conn: + cursor, events = kb.unseen_events_for_sub( + conn, task_id="task-1", platform="telegram", chat_id="123" + ) + assert isinstance(cursor, int) + assert isinstance(events, list) From 6ab71d3bb4cca36712b6895fd1bcc38fd3b9be4f Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Sat, 30 May 2026 09:34:17 +0800 Subject: [PATCH 33/89] fix(kanban): prevent infinite retry loop when worker exhausts iteration budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit recompute_ready() previously reset consecutive_failures to 0 when auto-recovering a blocked task. This defeated the circuit-breaker: a task that repeatedly exhausted its iteration budget would cycle forever (block → auto-recover with counter=0 → respawn → budget exhausted → block → …) with no signal to the operator. Fix: don't auto-recover tasks whose consecutive_failures has reached the effective failure limit (per-task max_retries or DEFAULT_FAILURE_LIMIT). The counter is also preserved across recovery so the breaker can accumulate across cycles. Fixes #35072 --- hermes_cli/kanban_db.py | 43 +++++++++++---- tests/hermes_cli/test_kanban_db.py | 89 ++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index c0e8372c727..95ba0d55c00 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2599,17 +2599,24 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ``blocked`` tasks are also considered for promotion (so a task blocked purely by a parent dependency unblocks itself when the - parent completes), *except* when the most recent block event was a - worker-initiated ``kanban_block`` — those stay blocked until an - explicit ``kanban_unblock`` (#28712). Without that guard, a - ``review-required`` handoff would auto-respawn, the fresh worker - would find nothing to do, exit cleanly, get recorded as a protocol - violation, and the cycle would repeat indefinitely. + parent completes), *except* in two cases: + + 1. The most recent block event was a worker-initiated + ``kanban_block`` — those stay blocked until an explicit + ``kanban_unblock`` (#28712). + + 2. The task's ``consecutive_failures`` has reached the effective + failure limit (per-task ``max_retries`` or + ``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry + loops when a task repeatedly exhausts its iteration budget: + without this guard the counter would reset on every recovery + cycle and the circuit breaker could never trip. """ promoted = 0 with write_txn(conn): todo_rows = conn.execute( - "SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')" + "SELECT id, status, consecutive_failures, max_retries " + "FROM tasks WHERE status IN ('todo', 'blocked')" ).fetchall() for row in todo_rows: task_id = row["id"] @@ -2627,13 +2634,25 @@ def recompute_ready(conn: sqlite3.Connection) -> int: (task_id,), ).fetchall() if all(p["status"] in ("done", "archived") for p in parents): - # Blocked tasks also get their failure counters reset — - # this is effectively an auto-unblock (circuit-breaker - # recovery; worker-initiated blocks are skipped above). if cur_status == "blocked": + # Don't auto-recover tasks that have hit the + # circuit-breaker failure limit. Without this + # guard, a task that repeatedly exhausts its + # iteration budget would cycle forever: + # block → auto-recover → respawn → budget + # exhausted → block → … The counter must also + # be preserved so the breaker can accumulate + # across recovery cycles. + failures = int(row["consecutive_failures"] or 0) + task_limit = row["max_retries"] + effective_limit = ( + int(task_limit) if task_limit is not None + else DEFAULT_FAILURE_LIMIT + ) + if failures >= effective_limit: + continue conn.execute( - "UPDATE tasks SET status = 'ready', " - "consecutive_failures = 0, last_failure_error = NULL " + "UPDATE tasks SET status = 'ready' " "WHERE id = ? AND status = 'blocked'", (task_id,), ) diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 020ad4fb425..8e333189177 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -307,7 +307,8 @@ def test_recompute_ready_cascades_through_chain(kanban_home): def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home): - """blocked tasks with all parents done should be promoted to ready.""" + """blocked tasks with all parents done should be promoted to ready, + unless the circuit-breaker failure limit has been reached.""" with kb.connect() as conn: parent = kb.create_task(conn, title="parent", assignee="a") child = kb.create_task( @@ -316,16 +317,16 @@ def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home): # Complete the parent kb.claim_task(conn, parent) kb.complete_task(conn, parent, result="ok") - # Manually block the child (simulates a worker that failed - # after the parent finished) + # Manually block the child with zero failures (simulates a + # dependency block, not a circuit-breaker block). conn.execute( - "UPDATE tasks SET status='blocked', consecutive_failures=5, " - "last_failure_error='persistent error' WHERE id=?", + "UPDATE tasks SET status='blocked', consecutive_failures=0, " + "last_failure_error=NULL WHERE id=?", (child,), ) conn.commit() assert kb.get_task(conn, child).status == "blocked" - # recompute_ready should promote blocked → ready and reset failures + # recompute_ready should promote blocked → ready promoted = kb.recompute_ready(conn) assert promoted == 1 task = kb.get_task(conn, child) @@ -815,6 +816,82 @@ def test_unblock_resets_failure_counters(kanban_home): assert task.last_failure_error is None +def test_recompute_ready_skips_tasks_at_failure_limit(kanban_home): + """recompute_ready must not auto-recover tasks whose consecutive_failures + has reached the circuit-breaker limit (#35072). + + Without this guard, a task that repeatedly exhausts its iteration + budget would cycle forever: block → auto-recover (counter reset) + → respawn → budget exhausted → block → … + """ + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent", assignee="a") + child = kb.create_task(conn, title="child", assignee="a", + parents=[parent]) + # Complete the parent so the child's dependencies are satisfied. + kb.claim_task(conn, parent) + kb.complete_task(conn, parent, summary="done") + + # Simulate the child having exhausted its budget twice, + # hitting the default failure limit (2). + kb.claim_task(conn, child) + kb._record_task_failure( + conn, child, error="budget exhausted 1", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + kb._record_task_failure( + conn, child, error="budget exhausted 2", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + task = kb.get_task(conn, child) + assert task.status == "blocked" + assert task.consecutive_failures >= 2 + + # recompute_ready must NOT promote this task — the circuit + # breaker has tripped and it should stay blocked. + promoted = kb.recompute_ready(conn) + assert promoted == 0 + assert kb.get_task(conn, child).status == "blocked" + + # Explicit unblock should still work and reset the counter. + assert kb.unblock_task(conn, child) + task = kb.get_task(conn, child) + assert task.status == "ready" + assert task.consecutive_failures == 0 + + +def test_recompute_ready_recovers_below_limit(kanban_home): + """recompute_ready auto-recovers blocked tasks that haven't hit the + failure limit yet — the counter is preserved across recovery.""" + with kb.connect() as conn: + t = kb.create_task(conn, title="task", assignee="a") + kb.claim_task(conn, t) + # One failure, below the default limit of 2. + kb._record_task_failure( + conn, t, error="budget exhausted 1", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == 1 + + # Simulate being blocked by something else (not circuit breaker). + conn.execute( + "UPDATE tasks SET status = 'blocked' WHERE id = ?", (t,), + ) + conn.commit() + + promoted = kb.recompute_ready(conn) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + # Counter must be preserved, not reset. + assert task.consecutive_failures == 1 + + # --------------------------------------------------------------------------- # Parent-completion invariant at the claim gate (RCA t_a6acd07d) # --------------------------------------------------------------------------- From 8e5a6854c3bf081c46df2600775f14bbbde9cc2d Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 21:31:55 -0700 Subject: [PATCH 34/89] fix(kanban): align recompute_ready guard with breaker's configured failure_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the budget-exhaustion recovery fix. recompute_ready's new circuit-breaker guard resolved its effective limit from per-task max_retries -> DEFAULT_FAILURE_LIMIT, skipping the dispatcher's configured kanban.failure_limit. _record_task_failure resolves max_retries -> failure_limit(config) -> DEFAULT, so the two disagreed whenever an operator set kanban.failure_limit != 2: - config > 2: a task could get stuck at DEFAULT(2) before reaching its allowed retry count. - config < 2: a task the breaker already blocked could be auto-recovered back to ready, defeating the stricter limit. Thread the dispatcher's failure_limit through dispatch_once into recompute_ready so the guard and the breaker share one resolution order. Updated test_circuit_breaker_block_still_auto_promotes (it asserted a failures=5 block auto-recovers and resets the counter — that's the pre-#35072 behavior the loop fix removes); it now exercises a below-limit transient block, with the at-limit case covered in test_kanban_db.py. Added two tests for the config-tier and per-task override resolution. --- hermes_cli/kanban_db.py | 28 +++++--- .../hermes_cli/test_kanban_blocked_sticky.py | 31 ++++++--- tests/hermes_cli/test_kanban_db.py | 67 +++++++++++++++++++ 3 files changed, 108 insertions(+), 18 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 95ba0d55c00..17fe7476dfe 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2591,7 +2591,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool: return bool(row) and row["kind"] == "blocked" -def recompute_ready(conn: sqlite3.Connection) -> int: +def recompute_ready( + conn: sqlite3.Connection, failure_limit: int = None, +) -> int: """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``. Returns the number of tasks promoted. Safe to call inside or outside @@ -2606,12 +2608,22 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ``kanban_unblock`` (#28712). 2. The task's ``consecutive_failures`` has reached the effective - failure limit (per-task ``max_retries`` or - ``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry - loops when a task repeatedly exhausts its iteration budget: - without this guard the counter would reset on every recovery - cycle and the circuit breaker could never trip. + failure limit. This prevents infinite retry loops when a task + repeatedly exhausts its iteration budget: without this guard the + counter would reset on every recovery cycle and the circuit + breaker could never trip (#35072). + + The effective failure limit resolves in the same order as the + circuit breaker in ``_record_task_failure`` so the two never + disagree about when a task is permanently blocked: + + 1. per-task ``max_retries`` if set + 2. caller-supplied ``failure_limit`` (the dispatcher passes the + ``kanban.failure_limit`` config value through ``dispatch_once``) + 3. ``DEFAULT_FAILURE_LIMIT`` """ + if failure_limit is None: + failure_limit = DEFAULT_FAILURE_LIMIT promoted = 0 with write_txn(conn): todo_rows = conn.execute( @@ -2647,7 +2659,7 @@ def recompute_ready(conn: sqlite3.Connection) -> int: task_limit = row["max_retries"] effective_limit = ( int(task_limit) if task_limit is not None - else DEFAULT_FAILURE_LIMIT + else int(failure_limit) ) if failures >= effective_limit: continue @@ -5577,7 +5589,7 @@ def dispatch_once( if _crash_auto_blocked: result.auto_blocked.extend(_crash_auto_blocked) result.timed_out = enforce_max_runtime(conn) - result.promoted = recompute_ready(conn) + result.promoted = recompute_ready(conn, failure_limit=failure_limit) # Count tasks already running so max_spawn enforces concurrency rather # than a per-tick spawn budget. See the docstring above for the full diff --git a/tests/hermes_cli/test_kanban_blocked_sticky.py b/tests/hermes_cli/test_kanban_blocked_sticky.py index e6bd093d938..2d7cafef826 100644 --- a/tests/hermes_cli/test_kanban_blocked_sticky.py +++ b/tests/hermes_cli/test_kanban_blocked_sticky.py @@ -106,20 +106,30 @@ def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Pa def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None: """A child that was put into ``blocked`` *without* a worker-issued - ``kanban_block`` (e.g. circuit-breaker after repeated spawn - failures, manual DB triage) must still get auto-promoted when its - parents complete — preserves the pre-#28712 recovery semantics.""" + ``kanban_block`` (e.g. a transient crash, manual DB triage) and whose + ``consecutive_failures`` is still *below* the circuit-breaker limit + must get auto-promoted when its parents complete — preserves the + pre-#28712 recovery semantics for genuinely transient failures. + + The complementary case — a block whose failure count has *reached* + the limit must stay blocked — is covered by + ``test_kanban_db.py::test_recompute_ready_skips_tasks_at_failure_limit`` + (#35072). Together they pin the contract: ``recompute_ready`` defers + the give-up decision to the same effective limit the breaker uses, so + the two never disagree. + """ with kb.connect() as conn: parent = kb.create_task(conn, title="parent") child = kb.create_task(conn, title="child", parents=[parent]) kb.complete_task(conn, parent, result="ok") - # Simulate a circuit-breaker / direct triage that flips status - # without emitting a ``blocked`` event — exactly what - # ``_record_task_failure`` does after a ``gave_up``. + # Simulate a transient circuit-breaker / direct triage that flips + # status without emitting a ``blocked`` event — exactly what + # ``_record_task_failure`` does below the limit. One failure is + # under the default limit (2), so recovery is still correct. conn.execute( - "UPDATE tasks SET status='blocked', consecutive_failures=5, " - "last_failure_error='persistent error' WHERE id=?", + "UPDATE tasks SET status='blocked', consecutive_failures=1, " + "last_failure_error='transient error' WHERE id=?", (child,), ) conn.commit() @@ -128,8 +138,9 @@ def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None: assert promoted == 1 task = kb.get_task(conn, child) assert task.status == "ready" - assert task.consecutive_failures == 0 - assert task.last_failure_error is None + # Counter is preserved across recovery (not reset) so the breaker + # can still accumulate if the task keeps failing (#35072). + assert task.consecutive_failures == 1 def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None: diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 8e333189177..b2510855ea2 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -892,6 +892,73 @@ def test_recompute_ready_recovers_below_limit(kanban_home): assert task.consecutive_failures == 1 +def test_recompute_ready_honours_dispatcher_failure_limit(kanban_home): + """The guard's effective limit must follow the same resolution order + as the circuit breaker (#35072): per-task max_retries → dispatcher + failure_limit → DEFAULT_FAILURE_LIMIT. + + Without threading the dispatcher's ``kanban.failure_limit`` through, + the guard falls back to DEFAULT_FAILURE_LIMIT and disagrees with the + breaker — sticking a task prematurely (config limit > default) or + letting a tripped task escape (config limit < default). + """ + with kb.connect() as conn: + # Config allows MORE retries than the default. A task blocked + # with failures below the configured limit must still recover. + t = kb.create_task(conn, title="lenient", assignee="a") + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=? " + "WHERE id=?", + (kb.DEFAULT_FAILURE_LIMIT, t), + ) + conn.commit() + # Default-limit call would stick it (failures >= default). + assert kb.recompute_ready(conn) == 0 + assert kb.get_task(conn, t).status == "blocked" + # Dispatcher configured a higher limit → recover, preserve counter. + promoted = kb.recompute_ready( + conn, failure_limit=kb.DEFAULT_FAILURE_LIMIT + 2 + ) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == kb.DEFAULT_FAILURE_LIMIT + + # Config allows FEWER retries than the default. A task at the + # stricter limit must stay blocked even though it's below default. + t2 = kb.create_task(conn, title="strict", assignee="a") + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=1 " + "WHERE id=?", + (t2,), + ) + conn.commit() + # Default-limit (2) would recover it (1 < 2). + # Stricter config limit (1) must keep it blocked (1 >= 1). + assert kb.recompute_ready(conn, failure_limit=1) == 0 + assert kb.get_task(conn, t2).status == "blocked" + + +def test_recompute_ready_per_task_max_retries_overrides_dispatcher(kanban_home): + """A per-task ``max_retries`` wins over the dispatcher failure_limit, + matching ``_record_task_failure``'s resolution order.""" + with kb.connect() as conn: + t = kb.create_task(conn, title="per-task", assignee="a") + # Per-task allows 4 retries; dispatcher config says 2. + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=2, " + "max_retries=4 WHERE id=?", + (t,), + ) + conn.commit() + # failures(2) < per-task limit(4) → recover, despite dispatcher=2. + promoted = kb.recompute_ready(conn, failure_limit=2) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == 2 + + # --------------------------------------------------------------------------- # Parent-completion invariant at the claim gate (RCA t_a6acd07d) # --------------------------------------------------------------------------- From 14517ac1f5977f4d21e10153069eb52aac60311c Mon Sep 17 00:00:00 2001 From: LeonSGP43 Date: Sat, 30 May 2026 09:08:35 +0800 Subject: [PATCH 35/89] fix(update): export launcher virtualenv to uv --- hermes_cli/main.py | 5 +++- tests/hermes_cli/test_cmd_update.py | 39 +++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 93ca26e90e6..e59d3708473 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -8929,7 +8929,10 @@ def _cmd_update_pip(args): cmd = [sys.executable, "-m", "pip", "install", "--upgrade", "hermes-agent"] print(f"→ Running: {' '.join(cmd)}") - result = subprocess.run(cmd) + run_kwargs = {} + if sys.prefix != sys.base_prefix: + run_kwargs["env"] = {**os.environ, "VIRTUAL_ENV": sys.prefix} + result = subprocess.run(cmd, **run_kwargs) if result.returncode != 0: print("✗ Update failed") sys.exit(1) diff --git a/tests/hermes_cli/test_cmd_update.py b/tests/hermes_cli/test_cmd_update.py index 0cb8d033eb8..ed9033ffce2 100644 --- a/tests/hermes_cli/test_cmd_update.py +++ b/tests/hermes_cli/test_cmd_update.py @@ -39,6 +39,45 @@ def mock_args(): return SimpleNamespace() +class TestCmdUpdatePip: + """Regression tests for pip-install update flows.""" + + @patch("shutil.which", return_value="/usr/bin/uv") + @patch("subprocess.run") + def test_update_pip_exports_virtualenv_from_sys_prefix( + self, mock_run, _mock_which, mock_args, monkeypatch + ): + from hermes_cli import main as hm + + mock_run.return_value = subprocess.CompletedProcess([], 0, stdout="", stderr="") + monkeypatch.delenv("VIRTUAL_ENV", raising=False) + monkeypatch.setattr(hm.sys, "prefix", "/tmp/hermes-launcher-venv") + monkeypatch.setattr(hm.sys, "base_prefix", "/usr") + + hm._cmd_update_pip(mock_args) + + assert mock_run.call_count == 1 + assert mock_run.call_args.args[0] == ["/usr/bin/uv", "pip", "install", "--upgrade", "hermes-agent"] + assert mock_run.call_args.kwargs["env"]["VIRTUAL_ENV"] == "/tmp/hermes-launcher-venv" + + @patch("shutil.which", return_value="/usr/bin/uv") + @patch("subprocess.run") + def test_update_pip_does_not_export_virtualenv_for_system_python( + self, mock_run, _mock_which, mock_args, monkeypatch + ): + from hermes_cli import main as hm + + mock_run.return_value = subprocess.CompletedProcess([], 0, stdout="", stderr="") + monkeypatch.delenv("VIRTUAL_ENV", raising=False) + monkeypatch.setattr(hm.sys, "prefix", "/usr") + monkeypatch.setattr(hm.sys, "base_prefix", "/usr") + + hm._cmd_update_pip(mock_args) + + assert mock_run.call_count == 1 + assert "env" not in mock_run.call_args.kwargs + + class TestCmdUpdateBranchFallback: """cmd_update falls back to main when current branch has no remote counterpart.""" From 93e6a05efc615bed00e6f4d5737d5ada5f54b020 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 01:41:33 -0700 Subject: [PATCH 36/89] feat(model-picker): group multi-endpoint providers under one row (#35227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here' Adds a user-chosen compression boundary to the existing /compress command. /compress here [N] summarizes everything except the most recent N exchanges (default 2), which are preserved verbatim — letting the user pick the compression boundary instead of relying on the automatic token-budget heuristic. Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139, Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20 - hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation guard (shared by CLI and gateway). - cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression; compress only the head, re-append the verbatim tail through the seam guard. - Preserves message-flow role alternation (seam guard merges any illegal user->user / assistant->assistant adjacency). - Reuses the existing _compress_context session-rotation/lock machinery — no changes to the compression core. - Bare /compress (full) and /compress behavior unchanged. Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved tool-call transcript, degenerate/multimodal seams, real handler path). * feat(model-picker): group multi-endpoint providers under one row The interactive provider pickers (hermes model, setup wizard, Telegram /model) listed every provider slug flat, so vendors with several endpoints (Kimi/Moonshot, MiniMax, xAI Grok, Google Gemini, OpenAI, OpenCode, GitHub Copilot) each occupied multiple top-level rows. Now related slugs fold into one top-level row that drills down to the specific endpoint. - models.py: add PROVIDER_GROUPS table + group_providers() fold (display only — CANONICAL_PROVIDERS, slugs, --provider, /model all unchanged and individually addressable). - hermes model (main.py): group rows drill into a member sub-picker, then dispatch to the existing _model_flow_* unchanged. setup wizard inherits it. - Telegram /model: new mpg: callback expands to member mp: buttons; single authenticated member degrades to a direct button. - Grouping is the single shared fold across all three surfaces. Validation: 163 targeted tests pass; E2E confirms group->member->model resolves to the correct concrete slug for all families. --- gateway/platforms/telegram.py | 111 ++++++++++++++---- hermes_cli/main.py | 87 ++++++++++++--- hermes_cli/models.py | 99 ++++++++++++++++ tests/gateway/test_telegram_model_picker.py | 72 ++++++++++++ tests/hermes_cli/test_provider_groups.py | 118 ++++++++++++++++++++ 5 files changed, 449 insertions(+), 38 deletions(-) create mode 100644 tests/hermes_cli/test_provider_groups.py diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 026d8151ceb..7b4d00e818f 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -2804,21 +2804,8 @@ class TelegramAdapter(BasePlatformAdapter): return slug try: - # Build provider buttons — 2 per row - buttons: list = [] - for p in providers: - count = p.get("total_models", len(p.get("models", []))) - label = f"{p['name']} ({count})" - if p.get("is_current"): - label = f"✓ {label}" - # Compact callback data: mp: (max 64 bytes) - buttons.append( - InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}") - ) - - rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)] - rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")]) - keyboard = InlineKeyboardMarkup(rows) + # Build provider buttons — folds provider groups (display only). + keyboard = self._build_provider_keyboard(providers) provider_label = get_label(current_provider) text = self.format_message( @@ -2865,6 +2852,56 @@ class TelegramAdapter(BasePlatformAdapter): _MODEL_PAGE_SIZE = 8 + def _build_provider_keyboard(self, providers: list): + """Build the top-level provider keyboard, folding provider groups. + + Provider families (Kimi/Moonshot, MiniMax, xAI Grok, ...) collapse to + a single ``mpg:`` button; tapping it drills into a member + sub-keyboard. Single providers (and groups with only one authenticated + member) render as direct ``mp:`` buttons. Grouping mirrors the + CLI ``hermes model`` picker via the shared ``group_providers`` fold, + so all surfaces stay consistent. + """ + try: + from hermes_cli.models import group_providers + except Exception: + group_providers = None + + by_slug = {p.get("slug"): p for p in providers} + + def _provider_button(p): + count = p.get("total_models", len(p.get("models", []))) + label = f"{p['name']} ({count})" + if p.get("is_current"): + label = f"✓ {label}" + return InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}") + + buttons: list = [] + if group_providers is not None: + for row in group_providers([p.get("slug") for p in providers]): + if row["kind"] == "group": + members = [by_slug[m] for m in row["members"] if m in by_slug] + count = sum( + m.get("total_models", len(m.get("models", []))) for m in members + ) + label = f"{row['label']} ▸ ({count})" + if any(m.get("is_current") for m in members): + label = f"✓ {label}" + buttons.append( + InlineKeyboardButton(label, callback_data=f"mpg:{row['group_id']}") + ) + else: + p = by_slug.get(row["slug"]) + if p is not None: + buttons.append(_provider_button(p)) + else: + for p in providers: + buttons.append(_provider_button(p)) + + rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)] + rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")]) + return InlineKeyboardMarkup(rows) + def _build_model_keyboard(self, models: list, page: int) -> tuple: """Build paginated model buttons. Returns (keyboard, page_info_text).""" page_size = self._MODEL_PAGE_SIZE @@ -3043,10 +3080,23 @@ class TelegramAdapter(BasePlatformAdapter): # Clean up state self._model_picker_state.pop(chat_id, None) - elif data == "mb": - # --- Back to provider list --- + elif data.startswith("mpg:"): + # --- Provider group selected: show member providers --- + group_id = data[4:] + try: + from hermes_cli.models import PROVIDER_GROUPS + _label, member_slugs = PROVIDER_GROUPS.get(group_id, ("", [])) + except Exception: + _label, member_slugs = "", [] + + by_slug = {p["slug"]: p for p in state["providers"]} + members = [by_slug[m] for m in member_slugs if m in by_slug] + if not members: + await query.answer(text="Group not found.") + return + buttons = [] - for p in state["providers"]: + for p in members: count = p.get("total_models", len(p.get("models", []))) label = f"{p['name']} ({count})" if p.get("is_current"): @@ -3054,11 +3104,30 @@ class TelegramAdapter(BasePlatformAdapter): buttons.append( InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}") ) - rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)] - rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")]) + rows.append([ + InlineKeyboardButton("◀ Back", callback_data="mb"), + InlineKeyboardButton("✗ Cancel", callback_data="mx"), + ]) keyboard = InlineKeyboardMarkup(rows) + await query.edit_message_text( + text=self.format_message( + ( + f"⚙ *Model Configuration*\n\n" + f"Provider family: *{_label or group_id}*\n\n" + f"Select a provider:" + ) + ), + parse_mode=ParseMode.MARKDOWN_V2, + reply_markup=keyboard, + ) + await query.answer() + + elif data == "mb": + # --- Back to provider list (folds groups) --- + keyboard = self._build_provider_keyboard(state["providers"]) + try: provider_label = get_label(state["current_provider"]) except Exception: @@ -3107,7 +3176,7 @@ class TelegramAdapter(BasePlatformAdapter): query_user_name = getattr(query.from_user, "first_name", None) # --- Model picker callbacks --- - if data.startswith(("mp:", "mm:", "mb", "mx", "mg:")): + if data.startswith(("mp:", "mpg:", "mm:", "mb", "mx", "mg:")): chat_id = str(query.message.chat_id) if query.message else None if chat_id: await self._handle_model_picker_callback(query, data, chat_id) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index e59d3708473..165866cc67e 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -2394,7 +2394,12 @@ def select_provider_and_model(args=None): if active == "openrouter" and get_env_value("OPENAI_BASE_URL"): active = "custom" - from hermes_cli.models import CANONICAL_PROVIDERS, _PROVIDER_LABELS + from hermes_cli.models import ( + CANONICAL_PROVIDERS, + _PROVIDER_LABELS, + group_providers, + provider_group_for_slug, + ) provider_labels = dict(_PROVIDER_LABELS) # derive from canonical list if active and active in _custom_provider_map: @@ -2407,8 +2412,43 @@ def select_provider_and_model(args=None): print(f" Active provider: {active_label}") print() - # Step 1: Provider selection — flat list from CANONICAL_PROVIDERS - all_providers = [(p.slug, p.tui_desc) for p in CANONICAL_PROVIDERS] + # Step 1: Provider selection. + # + # Canonical providers are folded into top-level groups (display only — see + # PROVIDER_GROUPS in hermes_cli/models.py). A multi-member group shows one + # row ("Kimi / Moonshot ▸"); picking it opens a member sub-picker that + # resolves back to a concrete slug, so the dispatch chain below is + # unchanged. Custom providers and the trailing actions stay flat. + canonical_descs = {p.slug: p.tui_desc for p in CANONICAL_PROVIDERS} + grouped_rows = group_providers([p.slug for p in CANONICAL_PROVIDERS]) + + # The group/slug that should be pre-selected: the active provider's group + # if it's grouped, otherwise the active slug itself. + active_group = provider_group_for_slug(active) if active else "" + + # ordered entries: (key, label, members) + # members == [] → leaf row, key is a provider slug / action + # members != [] → group row, key is "group:" + ordered: list[tuple[str, str, list[str]]] = [] + default_idx = 0 + for row in grouped_rows: + if row["kind"] == "group": + gid = row["group_id"] + label = f"{row['label']} ▸" + key = f"group:{gid}" + is_active = bool(active_group) and gid == active_group + members = row["members"] + else: + slug = row["slug"] + label = canonical_descs.get(slug, provider_labels.get(slug, slug)) + key = slug + is_active = bool(active) and slug == active + members = [] + if is_active: + ordered.append((key, f"{label} ← currently active", members)) + default_idx = len(ordered) - 1 + else: + ordered.append((key, label, members)) for key, provider_info in _custom_provider_map.items(): name = provider_info["name"] @@ -2416,36 +2456,49 @@ def select_provider_and_model(args=None): short_url = base_url.replace("https://", "").replace("http://", "").rstrip("/") saved_model = provider_info.get("model", "") model_hint = f" — {saved_model}" if saved_model else "" - all_providers.append((key, f"{name} ({short_url}){model_hint}")) - - # Build the menu - ordered = [] - default_idx = 0 - for key, label in all_providers: + label = f"{name} ({short_url}){model_hint}" if active and key == active: - ordered.append((key, f"{label} ← currently active")) + ordered.append((key, f"{label} ← currently active", [])) default_idx = len(ordered) - 1 else: - ordered.append((key, label)) + ordered.append((key, label, [])) - ordered.append(("custom", "Custom endpoint (enter URL manually)")) + ordered.append(("custom", "Custom endpoint (enter URL manually)", [])) _has_saved_custom_list = isinstance(config.get("custom_providers"), list) and bool( config.get("custom_providers") ) if _has_saved_custom_list: - ordered.append(("remove-custom", "Remove a saved custom provider")) - ordered.append(("aux-config", "Configure auxiliary models...")) - ordered.append(("cancel", "Leave unchanged")) + ordered.append(("remove-custom", "Remove a saved custom provider", [])) + ordered.append(("aux-config", "Configure auxiliary models...", [])) + ordered.append(("cancel", "Leave unchanged", [])) provider_idx = _prompt_provider_choice( - [label for _, label in ordered], + [label for _, label, _ in ordered], default=default_idx, ) if provider_idx is None or ordered[provider_idx][0] == "cancel": print("No change.") return - selected_provider = ordered[provider_idx][0] + selected_key = ordered[provider_idx][0] + selected_members = ordered[provider_idx][2] + + # Group row → drill into a member sub-picker. Default to the active member + # if the active provider lives in this group. + if selected_members: + member_default = 0 + if active in selected_members: + member_default = selected_members.index(active) + member_labels = [ + canonical_descs.get(m, provider_labels.get(m, m)) for m in selected_members + ] + member_idx = _prompt_provider_choice(member_labels, default=member_default) + if member_idx is None: + print("No change.") + return + selected_provider = selected_members[member_idx] + else: + selected_provider = selected_key if selected_provider == "aux-config": _aux_config_menu() diff --git a/hermes_cli/models.py b/hermes_cli/models.py index 42eadfd7629..fba6ec94cfd 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -936,6 +936,105 @@ _PROVIDER_LABELS = {p.slug: p.label for p in CANONICAL_PROVIDERS} _PROVIDER_LABELS["custom"] = "Custom endpoint" # special case: not a named provider +# --------------------------------------------------------------------------- +# Provider groups — DISPLAY ONLY +# +# Some vendors expose several Hermes provider slugs (one per endpoint / +# auth method: global API, China API, OAuth coding plan, ...). Listing every +# slug as a top-level row in the interactive `hermes model` / setup wizard / +# Telegram `/model` pickers makes that list long and noisy. +# +# These groups fold related slugs under one top-level row in INTERACTIVE +# PICKERS only. They do NOT change ``CANONICAL_PROVIDERS``, slug identity, +# the ``--provider`` flag, ``/model ``, or any typed path — +# every member slug remains individually addressable. Grouping is a pure +# display affordance; ``group_providers()`` is the single fold used by all +# three picker surfaces so they stay consistent. +# +# group_id -> (display_label, [member_slug, ...]) +# +# Member order is the order shown inside the group submenu. +# --------------------------------------------------------------------------- +PROVIDER_GROUPS: dict[str, tuple[str, list[str]]] = { + "kimi": ("Kimi / Moonshot", ["kimi-coding", "kimi-coding-cn"]), + "minimax": ("MiniMax", ["minimax", "minimax-oauth", "minimax-cn"]), + "xai": ("xAI Grok", ["xai", "xai-oauth"]), + "google": ("Google Gemini", ["gemini", "google-gemini-cli"]), + "openai": ("OpenAI", ["openai-codex", "openai-api"]), + "opencode": ("OpenCode", ["opencode-zen", "opencode-go"]), + "copilot": ("GitHub Copilot", ["copilot", "copilot-acp"]), +} + +# Reverse index: member slug -> group_id. Built once at import. +_SLUG_TO_GROUP: dict[str, str] = { + slug: gid for gid, (_label, members) in PROVIDER_GROUPS.items() for slug in members +} + + +def provider_group_for_slug(slug: str) -> str: + """Return the group_id a provider slug belongs to, or "" if ungrouped.""" + return _SLUG_TO_GROUP.get(str(slug or "").strip().lower(), "") + + +def group_providers(slugs): + """Fold a flat ordered slug iterable into picker rows by provider group. + + DISPLAY ONLY. Used by every interactive picker (``hermes model``, the + setup wizard, the Telegram ``/model`` keyboard) so grouping is identical + across surfaces. + + Each returned row is a dict:: + + {"kind": "single", "slug": } # ungrouped, or + # 1-member group + {"kind": "group", "group_id": , "label":