diff --git a/acp_adapter/server.py b/acp_adapter/server.py index f8dade72af..dd9d75af9c 100644 --- a/acp_adapter/server.py +++ b/acp_adapter/server.py @@ -4,6 +4,7 @@ from __future__ import annotations import asyncio import contextvars +import json import logging import os from collections import defaultdict, deque @@ -47,6 +48,7 @@ from acp.schema import ( TextContentBlock, UnstructuredCommandInput, Usage, + UsageUpdate, UserMessageChunk, ) @@ -65,6 +67,7 @@ from acp_adapter.events import ( ) from acp_adapter.permissions import make_approval_callback from acp_adapter.session import SessionManager, SessionState, _expand_acp_enabled_toolsets +from acp_adapter.tools import build_tool_complete, build_tool_start logger = logging.getLogger(__name__) @@ -315,6 +318,66 @@ class HermesACPAgent(acp.Agent): return target_provider, new_model + @staticmethod + def _build_usage_update(state: SessionState) -> UsageUpdate | None: + """Build ACP native context-usage data for clients like Zed. + + Zed's circular context indicator is driven by ACP ``usage_update`` + session updates: ``size`` is the model context window and ``used`` is + the current request pressure. Hermes estimates ``used`` from the same + buckets it sends to providers: system prompt, conversation history, and + tool schemas. + """ + agent = state.agent + compressor = getattr(agent, "context_compressor", None) + size = int(getattr(compressor, "context_length", 0) or 0) + if size <= 0: + return None + + try: + from agent.model_metadata import estimate_request_tokens_rough + + used = estimate_request_tokens_rough( + state.history, + system_prompt=getattr(agent, "_cached_system_prompt", "") or "", + tools=getattr(agent, "tools", None) or None, + ) + except Exception: + logger.debug("Could not estimate ACP native context usage", exc_info=True) + used = int(getattr(compressor, "last_prompt_tokens", 0) or 0) + + return UsageUpdate( + session_update="usage_update", + size=max(size, 0), + used=max(used, 0), + ) + + async def _send_usage_update(self, state: SessionState) -> None: + """Send ACP native context usage to the connected client.""" + if not self._conn: + return + update = self._build_usage_update(state) + if update is None: + return + try: + await self._conn.session_update( + session_id=state.session_id, + update=update, + ) + except Exception: + logger.warning( + "Failed to send ACP usage update for session %s", + state.session_id, + exc_info=True, + ) + + def _schedule_usage_update(self, state: SessionState) -> None: + """Schedule native context indicator refresh after ACP responses.""" + if not self._conn: + return + loop = asyncio.get_running_loop() + loop.call_soon(asyncio.create_task, self._send_usage_update(state)) + async def _register_session_mcp_servers( self, state: SessionState, @@ -485,37 +548,99 @@ class HermesACPAgent(acp.Agent): ) return None + @staticmethod + def _history_tool_call_name_args(tool_call: dict[str, Any]) -> tuple[str, dict[str, Any]]: + """Extract function name/arguments from an OpenAI-style tool_call.""" + function = tool_call.get("function") if isinstance(tool_call.get("function"), dict) else {} + name = str(function.get("name") or tool_call.get("name") or "unknown_tool") + raw_args = function.get("arguments") or tool_call.get("arguments") or tool_call.get("args") or {} + if isinstance(raw_args, str): + try: + parsed = json.loads(raw_args) + except Exception: + parsed = {"raw": raw_args} + raw_args = parsed + if not isinstance(raw_args, dict): + raw_args = {} + return name, raw_args + + @staticmethod + def _history_tool_call_id(tool_call: dict[str, Any]) -> str: + """Return the stable provider tool call id for ACP history replay.""" + return str( + tool_call.get("id") + or tool_call.get("call_id") + or tool_call.get("tool_call_id") + or "" + ).strip() + async def _replay_session_history(self, state: SessionState) -> None: """Send persisted user/assistant history to clients during session/load. Zed's ACP history UI calls ``session/load`` after the user picks an item from the Agents sidebar. The agent must then replay the full conversation - as ``user_message_chunk`` / ``agent_message_chunk`` notifications; merely - restoring server-side state makes Hermes remember context, but leaves the - editor looking like a clean thread. + as user/assistant chunks plus reconstructed tool-call start/completion + notifications; merely restoring server-side state makes Hermes remember + context, but leaves the editor looking like a clean thread. """ if not self._conn or not state.history: return - for message in state.history: - role = str(message.get("role") or "") - if role not in {"user", "assistant"}: - continue - text = self._history_message_text(message) - if not text: - continue - update = self._history_message_update(role=role, text=text) - if update is None: - continue + active_tool_calls: dict[str, tuple[str, dict[str, Any]]] = {} + + async def _send(update: Any) -> bool: try: await self._conn.session_update(session_id=state.session_id, update=update) + return True except Exception: logger.warning( "Failed to replay ACP history for session %s", state.session_id, exc_info=True, ) - return + return False + + for message in state.history: + role = str(message.get("role") or "") + + if role in {"user", "assistant"}: + text = self._history_message_text(message) + if text: + update = self._history_message_update(role=role, text=text) + if update is not None and not await _send(update): + return + + if role == "assistant" and isinstance(message.get("tool_calls"), list): + for tool_call in message["tool_calls"]: + if not isinstance(tool_call, dict): + continue + tool_call_id = self._history_tool_call_id(tool_call) + if not tool_call_id: + continue + tool_name, args = self._history_tool_call_name_args(tool_call) + active_tool_calls[tool_call_id] = (tool_name, args) + if not await _send(build_tool_start(tool_call_id, tool_name, args)): + return + continue + + if role == "tool": + tool_call_id = str(message.get("tool_call_id") or "").strip() + tool_name = str(message.get("tool_name") or "").strip() + function_args: dict[str, Any] | None = None + if tool_call_id in active_tool_calls: + tool_name, function_args = active_tool_calls.pop(tool_call_id) + if not tool_call_id or not tool_name: + continue + result = message.get("content") + if not await _send( + build_tool_complete( + tool_call_id, + tool_name, + result=result if isinstance(result, str) else None, + function_args=function_args, + ) + ): + return async def new_session( self, @@ -527,11 +652,24 @@ class HermesACPAgent(acp.Agent): await self._register_session_mcp_servers(state, mcp_servers) logger.info("New session %s (cwd=%s)", state.session_id, cwd) self._schedule_available_commands_update(state.session_id) + self._schedule_usage_update(state) return NewSessionResponse( session_id=state.session_id, models=self._build_model_state(state), ) + def _schedule_history_replay(self, state: SessionState) -> None: + """Replay persisted history after session/load or session/resume returns. + + Zed only attaches streamed transcript/tool updates once the load/resume + response has completed. Sending replay notifications while the request is + still in-flight can make the server look correct in logs while the editor + drops or fails to attach the tool-call history. + """ + loop = asyncio.get_running_loop() + replay_coro = self._replay_session_history(state) + loop.call_soon(asyncio.create_task, replay_coro) + async def load_session( self, cwd: str, @@ -545,8 +683,9 @@ class HermesACPAgent(acp.Agent): return None await self._register_session_mcp_servers(state, mcp_servers) logger.info("Loaded session %s", session_id) - await self._replay_session_history(state) + self._schedule_history_replay(state) self._schedule_available_commands_update(session_id) + self._schedule_usage_update(state) return LoadSessionResponse(models=self._build_model_state(state)) async def resume_session( @@ -562,8 +701,9 @@ class HermesACPAgent(acp.Agent): state = self.session_manager.create_session(cwd=cwd) await self._register_session_mcp_servers(state, mcp_servers) logger.info("Resumed session %s", state.session_id) - await self._replay_session_history(state) + self._schedule_history_replay(state) self._schedule_available_commands_update(state.session_id) + self._schedule_usage_update(state) return ResumeSessionResponse(models=self._build_model_state(state)) async def cancel(self, session_id: str, **kwargs: Any) -> None: @@ -712,6 +852,7 @@ class HermesACPAgent(acp.Agent): if self._conn: update = acp.update_agent_message_text(response_text) await self._conn.session_update(session_id, update) + await self._send_usage_update(state) return PromptResponse(stop_reason="end_turn") # If Zed sends another regular prompt while the same ACP session is @@ -744,24 +885,37 @@ class HermesACPAgent(acp.Agent): tool_call_meta: dict[str, dict[str, Any]] = {} previous_approval_cb = None + streamed_message = False + if conn: tool_progress_cb = make_tool_progress_cb(conn, session_id, loop, tool_call_ids, tool_call_meta) - thinking_cb = make_thinking_cb(conn, session_id, loop) + reasoning_cb = make_thinking_cb(conn, session_id, loop) step_cb = make_step_cb(conn, session_id, loop, tool_call_ids, tool_call_meta) message_cb = make_message_cb(conn, session_id, loop) + + def stream_delta_cb(text: str) -> None: + nonlocal streamed_message + if text: + streamed_message = True + message_cb(text) + approval_cb = make_approval_callback(conn.request_permission, loop, session_id) else: tool_progress_cb = None - thinking_cb = None + reasoning_cb = None step_cb = None - message_cb = None + stream_delta_cb = None approval_cb = None agent = state.agent agent.tool_progress_callback = tool_progress_cb - agent.thinking_callback = thinking_cb + # ACP thought panes should not receive Hermes' local kawaii waiting/status + # updates. Route provider/model reasoning deltas instead; if the provider + # emits no reasoning, Zed should not get a fake "thinking" accordion. + agent.thinking_callback = None + agent.reasoning_callback = reasoning_cb agent.step_callback = step_cb - agent.message_callback = message_cb + agent.stream_delta_callback = stream_delta_cb # Approval callback is per-thread (thread-local, GHSA-qg5c-hvr5-hjgr). # Set it INSIDE _run_agent so the TLS write happens in the executor @@ -867,7 +1021,7 @@ class HermesACPAgent(acp.Agent): ) except Exception: logger.debug("Failed to auto-title ACP session %s", session_id, exc_info=True) - if final_response and conn: + if final_response and conn and not streamed_message: update = acp.update_agent_message_text(final_response) await conn.session_update(session_id, update) @@ -903,6 +1057,8 @@ class HermesACPAgent(acp.Agent): cached_read_tokens=result.get("cache_read_tokens"), ) + await self._send_usage_update(state) + stop_reason = "cancelled" if state.cancel_event and state.cancel_event.is_set() else "end_turn" return PromptResponse(stop_reason=stop_reason, usage=usage) @@ -1035,22 +1191,84 @@ class HermesACPAgent(acp.Agent): return f"Could not list tools: {e}" def _cmd_context(self, args: str, state: SessionState) -> str: + """Show ACP session context pressure and compression guidance.""" n_messages = len(state.history) - if n_messages == 0: - return "Conversation is empty (no messages yet)." - # Count by role + + # Count by role. roles: dict[str, int] = {} for msg in state.history: role = msg.get("role", "unknown") roles[role] = roles.get(role, 0) + 1 + + agent = state.agent + model = state.model or getattr(agent, "model", "") + provider = getattr(agent, "provider", None) or "auto" + compressor = getattr(agent, "context_compressor", None) + context_length = int(getattr(compressor, "context_length", 0) or 0) + threshold_tokens = int(getattr(compressor, "threshold_tokens", 0) or 0) + + try: + from agent.model_metadata import estimate_request_tokens_rough + + system_prompt = getattr(agent, "_cached_system_prompt", "") or "" + tools = getattr(agent, "tools", None) or None + approx_tokens = estimate_request_tokens_rough( + state.history, + system_prompt=system_prompt, + tools=tools, + ) + except Exception: + logger.debug("Could not estimate ACP context usage", exc_info=True) + approx_tokens = 0 + + if threshold_tokens <= 0 and context_length > 0: + threshold_tokens = int(context_length * 0.80) + lines = [ - f"Conversation: {n_messages} messages", + f"Conversation: {n_messages} messages" + if n_messages + else "Conversation is empty (no messages yet).", f" user: {roles.get('user', 0)}, assistant: {roles.get('assistant', 0)}, " f"tool: {roles.get('tool', 0)}, system: {roles.get('system', 0)}", ] - model = state.model or getattr(state.agent, "model", "") if model: lines.append(f"Model: {model}") + lines.append(f"Provider: {provider}") + + if approx_tokens > 0: + if context_length > 0: + usage_pct = (approx_tokens / context_length) * 100 + lines.append( + f"Context usage: ~{approx_tokens:,} / {context_length:,} tokens ({usage_pct:.1f}%)" + ) + else: + lines.append(f"Context usage: ~{approx_tokens:,} tokens") + + if threshold_tokens > 0: + if approx_tokens > 0: + threshold_pct = (threshold_tokens / context_length) * 100 if context_length > 0 else 0 + remaining = max(threshold_tokens - approx_tokens, 0) + if approx_tokens >= threshold_tokens: + lines.append( + f"Compression: due now (threshold ~{threshold_tokens:,}" + + (f", {threshold_pct:.0f}%" if threshold_pct else "") + + "). Run /compact." + ) + else: + lines.append( + f"Compression: ~{remaining:,} tokens until threshold " + f"(~{threshold_tokens:,}" + + (f", {threshold_pct:.0f}%" if threshold_pct else "") + + ")." + ) + else: + lines.append(f"Compression threshold: ~{threshold_tokens:,} tokens") + + if getattr(agent, "compression_enabled", True) is False: + lines.append("Compression is disabled for this agent.") + else: + lines.append("Tip: run /compact to compress manually before the threshold.") + return "\n".join(lines) def _cmd_reset(self, args: str, state: SessionState) -> str: diff --git a/acp_adapter/tools.py b/acp_adapter/tools.py index 067652106e..e7e53a6277 100644 --- a/acp_adapter/tools.py +++ b/acp_adapter/tools.py @@ -28,6 +28,11 @@ TOOL_KIND_MAP: Dict[str, ToolKind] = { "terminal": "execute", "process": "execute", "execute_code": "execute", + # Session/meta tools + "todo": "other", + "skill_view": "read", + "skills_list": "read", + "skill_manage": "edit", # Web / fetch "web_search": "fetch", "web_extract": "fetch", @@ -51,6 +56,28 @@ TOOL_KIND_MAP: Dict[str, ToolKind] = { } +_POLISHED_TOOLS = { + # Core operator loop + "todo", "memory", "session_search", "delegate_task", + # Files / execution + "read_file", "write_file", "patch", "search_files", "terminal", "process", "execute_code", + # Skills / web / browser / media + "skill_view", "skills_list", "skill_manage", "web_search", "web_extract", + "browser_navigate", "browser_click", "browser_type", "browser_press", "browser_scroll", + "browser_back", "browser_snapshot", "browser_console", "browser_get_images", "browser_vision", + "vision_analyze", "image_generate", "text_to_speech", + # Schedulers / platform integrations + "cronjob", "send_message", "clarify", "discord", "discord_admin", + "ha_list_entities", "ha_get_state", "ha_list_services", "ha_call_service", + "feishu_doc_read", "feishu_drive_list_comments", "feishu_drive_list_comment_replies", + "feishu_drive_reply_comment", "feishu_drive_add_comment", + "kanban_create", "kanban_show", "kanban_comment", "kanban_complete", + "kanban_block", "kanban_link", "kanban_heartbeat", + "yb_query_group_info", "yb_query_group_members", "yb_search_sticker", + "yb_send_dm", "yb_send_sticker", "mixture_of_agents", +} + + def get_tool_kind(tool_name: str) -> ToolKind: """Return the ACP ToolKind for a hermes tool, defaulting to 'other'.""" return TOOL_KIND_MAP.get(tool_name, "other") @@ -85,18 +112,645 @@ def build_tool_title(tool_name: str, args: Dict[str, Any]) -> str: if urls: return f"extract: {urls[0]}" + (f" (+{len(urls)-1})" if len(urls) > 1 else "") return "web extract" + if tool_name == "process": + action = str(args.get("action") or "").strip() or "manage" + sid = str(args.get("session_id") or "").strip() + return f"process {action}: {sid}" if sid else f"process {action}" if tool_name == "delegate_task": + tasks = args.get("tasks") + if isinstance(tasks, list) and tasks: + return f"delegate batch ({len(tasks)} tasks)" goal = args.get("goal", "") if goal and len(goal) > 60: goal = goal[:57] + "..." return f"delegate: {goal}" if goal else "delegate task" + if tool_name == "session_search": + query = str(args.get("query") or "").strip() + return f"session search: {query}" if query else "recent sessions" + if tool_name == "memory": + action = str(args.get("action") or "manage").strip() or "manage" + target = str(args.get("target") or "memory").strip() or "memory" + return f"memory {action}: {target}" if tool_name == "execute_code": - return "execute code" + code = str(args.get("code") or "").strip() + first_line = next((line.strip() for line in code.splitlines() if line.strip()), "") + if first_line: + if len(first_line) > 70: + first_line = first_line[:67] + "..." + return f"python: {first_line}" + return "python code" + if tool_name == "todo": + items = args.get("todos") + if isinstance(items, list): + return f"todo ({len(items)} item{'s' if len(items) != 1 else ''})" + return "todo" + if tool_name == "skill_view": + name = str(args.get("name") or "?").strip() or "?" + file_path = str(args.get("file_path") or "").strip() + suffix = f"/{file_path}" if file_path else "" + return f"skill view ({name}{suffix})" + if tool_name == "skills_list": + category = str(args.get("category") or "").strip() + return f"skills list ({category})" if category else "skills list" + if tool_name == "skill_manage": + action = str(args.get("action") or "manage").strip() or "manage" + name = str(args.get("name") or "?").strip() or "?" + file_path = str(args.get("file_path") or "").strip() + target = f"{name}/{file_path}" if file_path else name + if len(target) > 64: + target = target[:61] + "..." + return f"skill {action}: {target}" + if tool_name == "browser_navigate": + return f"navigate: {args.get('url', '?')}" + if tool_name == "browser_snapshot": + return "browser snapshot" + if tool_name == "browser_vision": + return f"browser vision: {str(args.get('question', '?'))[:50]}" + if tool_name == "browser_get_images": + return "browser images" if tool_name == "vision_analyze": - return f"analyze image: {args.get('question', '?')[:50]}" + return f"analyze image: {str(args.get('question', '?'))[:50]}" + if tool_name == "image_generate": + prompt = str(args.get("prompt") or args.get("description") or "").strip() + return f"generate image: {prompt[:50]}" if prompt else "generate image" + if tool_name == "cronjob": + action = str(args.get("action") or "manage").strip() or "manage" + job_id = str(args.get("job_id") or args.get("id") or "").strip() + return f"cron {action}: {job_id}" if job_id else f"cron {action}" return tool_name +def _text(content: str) -> Any: + return acp.tool_content(acp.text_block(content)) + + +def _json_loads_maybe(value: Optional[str]) -> Any: + if not isinstance(value, str): + return value + try: + return json.loads(value) + except Exception: + pass + + # Some Hermes tools append a human hint after a JSON payload, e.g. + # ``{...}\n\n[Hint: Results truncated...]``. Keep the structured rendering path + # by decoding the first JSON value instead of falling back to raw text. + try: + decoded, _ = json.JSONDecoder().raw_decode(value.lstrip()) + return decoded + except Exception: + return None + + +def _truncate_text(text: str, limit: int = 5000) -> str: + if len(text) <= limit: + return text + return text[: max(0, limit - 100)] + f"\n... ({len(text)} chars total, truncated)" + + +def _fenced_text(text: str, language: str = "") -> str: + """Return a Markdown fence that cannot be broken by backticks in text.""" + longest = max((len(run) for run in text.split("`")[1::2]), default=0) + fence = "`" * max(3, longest + 1) + return f"{fence}{language}\n{text}\n{fence}" + + +def _format_todo_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict) or not isinstance(data.get("todos"), list): + return None + summary = data.get("summary") if isinstance(data.get("summary"), dict) else {} + icon = { + "completed": "โœ…", + "in_progress": "๐Ÿ”„", + "pending": "โณ", + "cancelled": "โœ—", + } + lines = ["**Todo list**", ""] + for item in data["todos"]: + if not isinstance(item, dict): + continue + status = str(item.get("status") or "pending") + content = str(item.get("content") or item.get("id") or "").strip() + if content: + lines.append(f"- {icon.get(status, 'โ€ข')} {content}") + if summary: + cancelled = summary.get("cancelled", 0) + lines.extend([ + "", + "**Progress:** " + f"{summary.get('completed', 0)} completed, " + f"{summary.get('in_progress', 0)} in progress, " + f"{summary.get('pending', 0)} pending" + + (f", {cancelled} cancelled" if cancelled else ""), + ]) + return "\n".join(lines) + + +def _format_read_file_result(result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + if data.get("error") and not data.get("content"): + return f"Read failed: {data.get('error')}" + content = data.get("content") + if not isinstance(content, str): + return None + path = str((args or {}).get("path") or data.get("path") or "file").strip() + offset = (args or {}).get("offset") + limit = (args or {}).get("limit") + range_bits = [] + if offset: + range_bits.append(f"from line {offset}") + if limit: + range_bits.append(f"limit {limit}") + suffix = f" ({', '.join(range_bits)})" if range_bits else "" + header = f"Read {path}{suffix}" + if data.get("total_lines") is not None: + header += f" โ€” {data.get('total_lines')} total lines" + # Hermes read_file output is line-numbered with `|`. If we send it as raw + # Markdown, Zed can interpret pipes as tables and collapse the layout. + # Fence the payload so file lines stay readable and literal. + return _truncate_text(f"{header}\n\n{_fenced_text(content)}") + + +def _format_search_files_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + matches = data.get("matches") + if not isinstance(matches, list): + return None + + total = data.get("total_count", len(matches)) + shown = min(len(matches), 12) + truncated = bool(data.get("truncated")) or len(matches) > shown + lines = [ + "Search results", + f"Found {total} match{'es' if total != 1 else ''}; showing {shown}.", + "", + ] + + for match in matches[:shown]: + if not isinstance(match, dict): + lines.append(f"- {match}") + continue + + path = str(match.get("path") or match.get("file") or match.get("filename") or "?") + line = match.get("line") or match.get("line_number") + content = str(match.get("content") or match.get("text") or "").strip() + loc = f"{path}:{line}" if line else path + lines.append(f"- {loc}") + if content: + snippet = _truncate_text(" ".join(content.split()), 300) + lines.append(f" {snippet}") + + if truncated: + lines.extend([ + "", + "Results truncated. Narrow the search, add file_glob, or use offset to page.", + ]) + return _truncate_text("\n".join(lines), limit=7000) + + +def _format_execute_code_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return result if isinstance(result, str) and result.strip() else None + output = str(data.get("output") or "") + error = str(data.get("error") or "") + exit_code = data.get("exit_code") + parts = [f"Exit code: {exit_code}" if exit_code is not None else "Execution complete"] + if output: + parts.extend(["", "Output:", output]) + if error: + parts.extend(["", "Error:", error]) + return _truncate_text("\n".join(parts)) + + +def _extract_markdown_headings(content: str, limit: int = 8) -> list[str]: + headings: list[str] = [] + for line in content.splitlines(): + stripped = line.strip() + if stripped.startswith("#"): + heading = stripped.lstrip("#").strip() + if heading: + headings.append(heading) + if len(headings) >= limit: + break + return headings + + +def _format_skill_view_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + if data.get("success") is False: + return f"Skill view failed: {data.get('error', 'unknown error')}" + name = str(data.get("name") or "skill") + file_path = str(data.get("file") or data.get("path") or "SKILL.md") + description = str(data.get("description") or "").strip() + content = str(data.get("content") or "") + linked = data.get("linked_files") if isinstance(data.get("linked_files"), dict) else None + + lines = ["**Skill loaded**", "", f"- **Name:** `{name}`", f"- **File:** `{file_path}`"] + if description: + lines.append(f"- **Description:** {description}") + if content: + lines.append(f"- **Content:** {len(content):,} chars loaded into agent context") + if linked: + linked_count = sum(len(v) for v in linked.values() if isinstance(v, list)) + lines.append(f"- **Linked files:** {linked_count}") + + headings = _extract_markdown_headings(content) + if headings: + lines.extend(["", "**Sections**"]) + lines.extend(f"- {heading}" for heading in headings) + + lines.extend([ + "", + "_Full skill content is available to the agent but hidden here to keep ACP readable._", + ]) + return "\n".join(lines) + + +def _format_skill_manage_result(result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + + action = str((args or {}).get("action") or "manage").strip() or "manage" + name = str((args or {}).get("name") or data.get("name") or "skill").strip() or "skill" + file_path = str((args or {}).get("file_path") or data.get("file_path") or "SKILL.md").strip() or "SKILL.md" + success = data.get("success") + status = "โœ… Skill updated" if success is not False else "โœ— Skill update failed" + + lines = [f"**{status}**", "", f"- **Action:** `{action}`", f"- **Skill:** `{name}`"] + if action not in {"delete"}: + lines.append(f"- **File:** `{file_path}`") + + message = str(data.get("message") or data.get("error") or "").strip() + if message: + lines.append(f"- **Result:** {message}") + + replacements = data.get("replacements") or data.get("replacement_count") + if replacements is not None: + lines.append(f"- **Replacements:** {replacements}") + + path = str(data.get("path") or "").strip() + if path: + lines.append(f"- **Path:** `{path}`") + + return "\n".join(lines) + + +def _format_web_search_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + web = data.get("data", {}).get("web") if isinstance(data.get("data"), dict) else data.get("web") + if not isinstance(web, list): + return None + lines = [f"Web results: {len(web)}"] + for item in web[:10]: + if not isinstance(item, dict): + continue + title = str(item.get("title") or item.get("url") or "result").strip() + url = str(item.get("url") or "").strip() + desc = str(item.get("description") or "").strip() + lines.append(f"โ€ข {title}" + (f" โ€” {url}" if url else "")) + if desc: + lines.append(f" {desc}") + return _truncate_text("\n".join(lines)) + + +def _format_web_extract_result(result: Optional[str]) -> Optional[str]: + """Return only web_extract errors for ACP; success stays compact via title.""" + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + if data.get("success") is False and data.get("error"): + return f"Web extract failed: {data.get('error')}" + results = data.get("results") + if not isinstance(results, list): + return None + + failures: list[str] = [] + for item in results[:10]: + if not isinstance(item, dict): + continue + error = str(item.get("error") or "").strip() + if not error or error in {"None", "null"}: + continue + url = str(item.get("url") or "").strip() + title = str(item.get("title") or url or "Untitled").strip() + failures.append( + f"- {title}" + (f" โ€” {url}" if url and url != title else "") + f"\n Error: {_truncate_text(error, limit=500)}" + ) + + if not failures: + return None + lines = [f"Web extract failed for {len(failures)} URL{'s' if len(failures) != 1 else ''}"] + lines.extend(failures) + return "\n".join(lines) + + +def _format_process_result(result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return result if isinstance(result, str) and result.strip() else None + if data.get("success") is False and data.get("error"): + return f"Process error: {data.get('error')}" + action = str((args or {}).get("action") or "process").strip() or "process" + if isinstance(data.get("processes"), list): + processes = data["processes"] + lines = [f"Processes: {len(processes)}"] + for proc in processes[:20]: + if not isinstance(proc, dict): + lines.append(f"- {proc}") + continue + sid = str(proc.get("session_id") or proc.get("id") or "?") + status = str(proc.get("status") or ("exited" if proc.get("exited") else "running")) + cmd = str(proc.get("command") or "").strip() + pid = proc.get("pid") + code = proc.get("exit_code") + bits = [status] + if pid is not None: + bits.append(f"pid {pid}") + if code is not None: + bits.append(f"exit {code}") + lines.append(f"- `{sid}` โ€” {', '.join(bits)}" + (f" โ€” {cmd[:120]}" if cmd else "")) + if len(processes) > 20: + lines.append(f"... {len(processes) - 20} more process(es)") + return "\n".join(lines) + + status = str(data.get("status") or data.get("state") or action).strip() + sid = str(data.get("session_id") or (args or {}).get("session_id") or "").strip() + lines = [f"Process {action}: {status}" + (f" (`{sid}`)" if sid else "")] + for key, label in (("command", "Command"), ("pid", "PID"), ("exit_code", "Exit code"), ("returncode", "Exit code"), ("lines", "Lines")): + if data.get(key) is not None: + lines.append(f"- **{label}:** {data.get(key)}") + output = data.get("output") or data.get("new_output") or data.get("log") or data.get("stdout") + error = data.get("error") or data.get("stderr") + if output: + lines.extend(["", "Output:", _truncate_text(str(output), limit=5000)]) + if error: + lines.extend(["", "Error:", _truncate_text(str(error), limit=2000)]) + msg = data.get("message") + if msg and not output and not error: + lines.append(str(msg)) + return _truncate_text("\n".join(lines), limit=7000) + + +def _format_delegate_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + if data.get("error") and not isinstance(data.get("results"), list): + return f"Delegation failed: {data.get('error')}" + results = data.get("results") + if not isinstance(results, list): + return None + total = data.get("total_duration_seconds") + lines = [f"Delegation results: {len(results)} task{'s' if len(results) != 1 else ''}" + (f" in {total}s" if total is not None else "")] + icon = {"completed": "โœ…", "failed": "โœ—", "error": "โœ—", "timeout": "โฑ", "interrupted": "โš "} + for item in results: + if not isinstance(item, dict): + lines.append(f"- {item}") + continue + idx = item.get("task_index") + status = str(item.get("status") or "unknown") + model = item.get("model") + dur = item.get("duration_seconds") + role = item.get("_child_role") + header = f"{icon.get(status, 'โ€ข')} Task {idx + 1 if isinstance(idx, int) else '?'}: {status}" + bits = [] + if model: + bits.append(str(model)) + if role: + bits.append(f"role={role}") + if dur is not None: + bits.append(f"{dur}s") + if bits: + header += " (" + ", ".join(bits) + ")" + lines.extend(["", header]) + summary = str(item.get("summary") or "").strip() + error = str(item.get("error") or "").strip() + if summary: + lines.append(_truncate_text(summary, limit=1200)) + if error: + lines.append("Error: " + _truncate_text(error, limit=800)) + trace = item.get("tool_trace") + if isinstance(trace, list) and trace: + names = [str(t.get("tool") or "?") for t in trace if isinstance(t, dict)] + if names: + lines.append("Tools: " + ", ".join(names[:12]) + (f" (+{len(names)-12})" if len(names) > 12 else "")) + return _truncate_text("\n".join(lines), limit=8000) + + +def _format_session_search_result(result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + if data.get("success") is False: + return f"Session search failed: {data.get('error', 'unknown error')}" + results = data.get("results") + if not isinstance(results, list): + return None + mode = data.get("mode") or "search" + query = data.get("query") + lines = ["Recent sessions" if mode == "recent" else f"Session search results" + (f" for `{query}`" if query else "")] + if not results: + lines.append(str(data.get("message") or "No matching sessions found.")) + return "\n".join(lines) + for item in results: + if not isinstance(item, dict): + continue + sid = str(item.get("session_id") or "?") + title = str(item.get("title") or item.get("when") or "Untitled session").strip() + when = str(item.get("last_active") or item.get("started_at") or item.get("when") or "").strip() + count = item.get("message_count") + source = str(item.get("source") or "").strip() + meta = ", ".join(str(x) for x in [when, source, f"{count} msgs" if count is not None else ""] if x) + lines.append(f"- **{title}** (`{sid}`)" + (f" โ€” {meta}" if meta else "")) + summary = str(item.get("summary") or item.get("preview") or "").strip() + if summary: + lines.append(" " + _truncate_text(" ".join(summary.split()), limit=500)) + return _truncate_text("\n".join(lines), limit=7000) + + +def _format_memory_result(result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return None + action = str((args or {}).get("action") or "memory").strip() or "memory" + target = str(data.get("target") or (args or {}).get("target") or "memory") + if data.get("success") is False: + lines = [f"โœ— Memory {action} failed ({target})", str(data.get("error") or "unknown error")] + matches = data.get("matches") + if isinstance(matches, list) and matches: + lines.append("Matches:") + lines.extend(f"- {_truncate_text(str(m), 160)}" for m in matches[:5]) + return "\n".join(lines) + lines = [f"โœ… Memory {action} saved ({target})"] + if data.get("message"): + lines.append(str(data.get("message"))) + if data.get("entry_count") is not None: + lines.append(f"Entries: {data.get('entry_count')}") + if data.get("usage"): + lines.append(f"Usage: {data.get('usage')}") + # Avoid dumping all memory entries into ACP UI; show only the explicit new value preview. + preview = str((args or {}).get("content") or (args or {}).get("old_text") or "").strip() + if preview: + lines.append("Preview: " + _truncate_text(preview, limit=300)) + return "\n".join(lines) + + +def _format_edit_result(tool_name: str, result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + path = str((args or {}).get("path") or "file").strip() + if isinstance(data, dict): + if data.get("success") is False or data.get("error"): + return f"{tool_name} failed for {path}: {data.get('error', 'unknown error')}" + message = str(data.get("message") or "").strip() + replacements = data.get("replacements") or data.get("replacement_count") + lines = [f"โœ… {tool_name} completed" + (f" for `{path}`" if path else "")] + if message: + lines.append(message) + if replacements is not None: + lines.append(f"Replacements: {replacements}") + if data.get("files_modified"): + files = data.get("files_modified") + if isinstance(files, list): + lines.append("Files: " + ", ".join(f"`{f}`" for f in files[:8])) + return "\n".join(lines) + if isinstance(result, str) and result.strip(): + return _truncate_text(result, limit=3000) + return f"โœ… {tool_name} completed" + (f" for `{path}`" if path else "") + + +def _format_browser_result(tool_name: str, result: Optional[str], args: Optional[Dict[str, Any]]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return result if isinstance(result, str) and result.strip() else None + if data.get("success") is False or data.get("error"): + return f"{tool_name} failed: {data.get('error', 'unknown error')}" + if tool_name == "browser_get_images": + images = data.get("images") or data.get("data") + if isinstance(images, list): + lines = [f"Images found: {len(images)}"] + for img in images[:12]: + if isinstance(img, dict): + alt = str(img.get("alt") or "").strip() + url = str(img.get("url") or img.get("src") or "").strip() + lines.append(f"- {alt or 'image'}" + (f" โ€” {url}" if url else "")) + return _truncate_text("\n".join(lines), limit=5000) + title = str(data.get("title") or data.get("url") or data.get("status") or tool_name) + text = str(data.get("text") or data.get("content") or data.get("snapshot") or data.get("analysis") or data.get("message") or "").strip() + lines = [title] + if data.get("url") and data.get("url") != title: + lines.append(str(data.get("url"))) + if text: + lines.extend(["", _truncate_text(text, limit=5000)]) + return _truncate_text("\n".join(lines), limit=7000) + + +def _format_media_or_cron_result(tool_name: str, result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, dict): + return result if isinstance(result, str) and result.strip() else None + if data.get("success") is False or data.get("error"): + return f"{tool_name} failed: {data.get('error', 'unknown error')}" + lines = [f"โœ… {tool_name} completed"] + for key in ("file_path", "path", "url", "image_url", "job_id", "id", "status", "message", "next_run"): + if data.get(key): + lines.append(f"- **{key}:** {data.get(key)}") + return "\n".join(lines) + + +def _format_generic_structured_result(tool_name: str, result: Optional[str]) -> Optional[str]: + data = _json_loads_maybe(result) + if not isinstance(data, (dict, list)): + return result if isinstance(result, str) and result.strip() else None + if isinstance(data, list): + lines = [f"{tool_name}: {len(data)} item{'s' if len(data) != 1 else ''}"] + for item in data[:12]: + lines.append(f"- {_truncate_text(str(item), limit=240)}") + return _truncate_text("\n".join(lines), limit=5000) + + if data.get("success") is False or data.get("error"): + return f"{tool_name} failed: {data.get('error', 'unknown error')}" + + lines = [f"โœ… {tool_name} completed" if data.get("success") is True else f"{tool_name} result"] + priority_keys = ( + "message", "status", "id", "task_id", "issue_id", "title", "name", "entity_id", + "state", "service", "url", "path", "file_path", "count", "total", "next_run", + ) + seen = set() + for key in priority_keys: + value = data.get(key) + if value in (None, "", [], {}): + continue + seen.add(key) + lines.append(f"- **{key}:** {_truncate_text(str(value), limit=500)}") + + for key, value in data.items(): + if key in seen or key in {"success", "raw", "content", "entries"}: + continue + if value in (None, "", [], {}): + continue + if isinstance(value, (dict, list)): + preview = json.dumps(value, ensure_ascii=False, default=str) + else: + preview = str(value) + lines.append(f"- **{key}:** {_truncate_text(preview, limit=500)}") + if len(lines) >= 14: + break + + content = data.get("content") + if isinstance(content, str) and content.strip(): + lines.extend(["", _truncate_text(content.strip(), limit=1500)]) + return _truncate_text("\n".join(lines), limit=7000) + + +def _build_polished_completion_content( + tool_name: str, + result: Optional[str], + function_args: Optional[Dict[str, Any]], +) -> Optional[List[Any]]: + formatter = { + "todo": lambda: _format_todo_result(result), + "read_file": lambda: _format_read_file_result(result, function_args), + "write_file": lambda: _format_edit_result(tool_name, result, function_args), + "patch": lambda: _format_edit_result(tool_name, result, function_args), + "search_files": lambda: _format_search_files_result(result), + "execute_code": lambda: _format_execute_code_result(result), + "process": lambda: _format_process_result(result, function_args), + "delegate_task": lambda: _format_delegate_result(result), + "session_search": lambda: _format_session_search_result(result), + "memory": lambda: _format_memory_result(result, function_args), + "skill_view": lambda: _format_skill_view_result(result), + "skill_manage": lambda: _format_skill_manage_result(result, function_args), + "web_search": lambda: _format_web_search_result(result), + "web_extract": lambda: _format_web_extract_result(result), + "browser_navigate": lambda: _format_browser_result(tool_name, result, function_args), + "browser_snapshot": lambda: _format_browser_result(tool_name, result, function_args), + "browser_vision": lambda: _format_browser_result(tool_name, result, function_args), + "browser_get_images": lambda: _format_browser_result(tool_name, result, function_args), + "vision_analyze": lambda: _format_media_or_cron_result(tool_name, result), + "image_generate": lambda: _format_media_or_cron_result(tool_name, result), + "cronjob": lambda: _format_media_or_cron_result(tool_name, result), + }.get(tool_name) + if formatter is None and tool_name in _POLISHED_TOOLS: + formatter = lambda: _format_generic_structured_result(tool_name, result) + if formatter is None: + return None + text = formatter() + if not text: + return None + return [_text(text)] + + def _build_patch_mode_content(patch_text: str) -> List[Any]: """Parse V4A patch mode input into ACP diff blocks when possible.""" if not patch_text: @@ -258,7 +912,11 @@ def _build_tool_complete_content( except Exception: pass - return [acp.tool_content(acp.text_block(display_result))] + polished_content = _build_polished_completion_content(tool_name, result, function_args) + if polished_content: + return polished_content + + return [_text(display_result)] # --------------------------------------------------------------------------- @@ -288,7 +946,6 @@ def build_tool_start( content = _build_patch_mode_content(patch_text) return acp.start_tool_call( tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, ) if tool_name == "write_file": @@ -297,32 +954,172 @@ def build_tool_start( content = [acp.tool_diff_content(path=path, new_text=file_content)] return acp.start_tool_call( tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, ) if tool_name == "terminal": command = arguments.get("command", "") - content = [acp.tool_content(acp.text_block(f"$ {command}"))] + content = [_text(f"$ {command}")] return acp.start_tool_call( tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, ) if tool_name == "read_file": - path = arguments.get("path", "") - content = [acp.tool_content(acp.text_block(f"Reading {path}"))] + # The title and location already identify the file. Sending a synthetic + # "Reading ..." content block makes Zed render an unhelpful Output + # section before the real file contents arrive on completion. return acp.start_tool_call( - tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, + tool_call_id, title, kind=kind, content=None, locations=locations, ) if tool_name == "search_files": pattern = arguments.get("pattern", "") target = arguments.get("target", "content") - content = [acp.tool_content(acp.text_block(f"Searching for '{pattern}' ({target})"))] + search_path = arguments.get("path") + where = f" in {search_path}" if search_path else "" + content = [_text(f"Searching for '{pattern}' ({target}){where}")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "todo": + items = arguments.get("todos") + if isinstance(items, list): + preview_lines = ["Updating todo list", ""] + for item in items[:8]: + if isinstance(item, dict): + preview_lines.append(f"- {item.get('status', 'pending')}: {item.get('content', item.get('id', ''))}") + if len(items) > 8: + preview_lines.append(f"... {len(items) - 8} more") + content = [_text("\n".join(preview_lines))] + else: + content = [_text("Reading todo list")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "skill_view": + name = str(arguments.get("name") or "?").strip() or "?" + file_path = str(arguments.get("file_path") or "SKILL.md").strip() or "SKILL.md" + content = [_text(f"Loading skill '{name}' ({file_path})")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "skill_manage": + action = str(arguments.get("action") or "manage").strip() or "manage" + name = str(arguments.get("name") or "?").strip() or "?" + file_path = str(arguments.get("file_path") or "SKILL.md").strip() or "SKILL.md" + path = f"skills/{name}/{file_path}" if file_path else f"skills/{name}" + + if action == "patch": + old = str(arguments.get("old_string") or "") + new = str(arguments.get("new_string") or "") + content = [acp.tool_diff_content(path=path, old_text=old or None, new_text=new)] + elif action in {"edit", "create"}: + content = [ + acp.tool_diff_content( + path=path, + new_text=str(arguments.get("content") or ""), + ) + ] + elif action == "write_file": + target = str(arguments.get("file_path") or "file") + content = [ + acp.tool_diff_content( + path=f"skills/{name}/{target}", + new_text=str(arguments.get("file_content") or ""), + ) + ] + elif action in {"delete", "remove_file"}: + target = str(arguments.get("file_path") or file_path or name) + content = [_text(f"Removing {target} from skill '{name}'")] + else: + content = [_text(f"Running skill_manage action '{action}' on skill '{name}' ({file_path})")] + + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "execute_code": + code = str(arguments.get("code") or "").strip() + preview = code[:1200] + (f"\n... ({len(code)} chars total, truncated)" if len(code) > 1200 else "") + content = [_text(f"Running Python helper script:\n\n```python\n{preview}\n```" if preview else "Running Python helper script")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "web_search": + query = str(arguments.get("query") or "").strip() + content = [_text(f"Searching the web for: {query}" if query else "Searching the web")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "web_extract": + # The title identifies the URL(s). Avoid a duplicate content block so + # Zed renders this like read_file: compact start, concise completion. + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=None, locations=locations, + ) + + if tool_name == "process": + action = str(arguments.get("action") or "").strip() or "manage" + sid = str(arguments.get("session_id") or "").strip() + data_preview = str(arguments.get("data") or "").strip() + text = f"Process action: {action}" + (f"\nSession: {sid}" if sid else "") + if data_preview: + text += "\nInput: " + _truncate_text(data_preview, limit=500) + content = [_text(text)] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "delegate_task": + tasks = arguments.get("tasks") + if isinstance(tasks, list) and tasks: + lines = [f"Delegating {len(tasks)} tasks", ""] + for i, task in enumerate(tasks[:8], 1): + if isinstance(task, dict): + goal = str(task.get("goal") or "").strip() + role = str(task.get("role") or "").strip() + lines.append(f"{i}. " + _truncate_text(goal, limit=160) + (f" ({role})" if role else "")) + if len(tasks) > 8: + lines.append(f"... {len(tasks) - 8} more") + content = [_text("\n".join(lines))] + else: + goal = str(arguments.get("goal") or "").strip() + content = [_text("Delegating task" + (f":\n{_truncate_text(goal, limit=800)}" if goal else ""))] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "session_search": + query = str(arguments.get("query") or "").strip() + content = [_text(f"Searching past sessions for: {query}" if query else "Loading recent sessions")] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name == "memory": + action = str(arguments.get("action") or "manage").strip() or "manage" + target = str(arguments.get("target") or "memory").strip() or "memory" + preview = str(arguments.get("content") or arguments.get("old_text") or "").strip() + text = f"Memory {action} ({target})" + if preview: + text += "\nPreview: " + _truncate_text(preview, limit=500) + content = [_text(text)] + return acp.start_tool_call( + tool_call_id, title, kind=kind, content=content, locations=locations, + ) + + if tool_name in _POLISHED_TOOLS: + try: + args_text = json.dumps(arguments, indent=2, default=str) + except (TypeError, ValueError): + args_text = str(arguments) + content = [_text(_truncate_text(args_text, limit=1200))] return acp.start_tool_call( tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, ) # Generic fallback @@ -334,7 +1131,7 @@ def build_tool_start( content = [acp.tool_content(acp.text_block(args_text))] return acp.start_tool_call( tool_call_id, title, kind=kind, content=content, locations=locations, - raw_input=arguments, + raw_input=None if tool_name in _POLISHED_TOOLS else arguments, ) @@ -347,18 +1144,22 @@ def build_tool_complete( ) -> ToolCallProgress: """Create a ToolCallUpdate (progress) event for a completed tool call.""" kind = get_tool_kind(tool_name) - content = _build_tool_complete_content( - tool_name, - result, - function_args=function_args, - snapshot=snapshot, - ) + if tool_name == "web_extract": + error_text = _format_web_extract_result(result) + content = [_text(error_text)] if error_text else None + else: + content = _build_tool_complete_content( + tool_name, + result, + function_args=function_args, + snapshot=snapshot, + ) return acp.update_tool_call( tool_call_id, kind=kind, status="completed", content=content, - raw_output=result, + raw_output=None if tool_name in _POLISHED_TOOLS else result, ) diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index efee8f6bf1..8d8334acd1 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -1241,10 +1241,24 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]: if not tools: return [] result = [] + seen_names: set = set() for t in tools: fn = t.get("function", {}) + name = fn.get("name", "") + # Defensive dedup: Anthropic rejects requests with duplicate tool + # names. Upstream injection paths already dedup, but this guard + # converts a hard API failure into a warning. See: #18478 + if name and name in seen_names: + logger.warning( + "convert_tools_to_anthropic: duplicate tool name '%s' " + "โ€” dropping second occurrence", + name, + ) + continue + if name: + seen_names.add(name) result.append({ - "name": fn.get("name", ""), + "name": name, "description": fn.get("description", ""), "input_schema": _normalize_tool_input_schema( fn.get("parameters", {"type": "object", "properties": {}}) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index df3fdeccc6..b86f78f8ec 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -259,13 +259,68 @@ _PROVIDERS_WITHOUT_VISION: frozenset = frozenset({ "kimi-coding-cn", }) -# OpenRouter app attribution headers -_OR_HEADERS = { +# OpenRouter app attribution headers (base โ€” always sent) +_OR_HEADERS_BASE = { "HTTP-Referer": "https://hermes-agent.nousresearch.com", "X-OpenRouter-Title": "Hermes Agent", "X-OpenRouter-Categories": "productivity,cli-agent", } +# Truthy values for boolean env-var parsing. +_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"}) + + +def build_or_headers(or_config: dict | None = None) -> dict: + """Build OpenRouter headers, optionally including response-cache headers. + + Precedence for response cache: env var > config.yaml > default (enabled). + + Environment variables: + ``HERMES_OPENROUTER_CACHE`` โ€” truthy (``1``/``true``/``yes``/``on``) + enables caching; ``0``/``false``/``no``/``off`` disables. + Overrides ``openrouter.response_cache`` in config.yaml. + ``HERMES_OPENROUTER_CACHE_TTL`` โ€” integer seconds (1-86400). + Overrides ``openrouter.response_cache_ttl`` in config.yaml. + + *or_config* is the ``openrouter`` section from config.yaml. When *None*, + falls back to reading config from disk via ``load_config()``. + """ + headers = dict(_OR_HEADERS_BASE) + + # Resolve config from disk if not provided. + if or_config is None: + try: + from hermes_cli.config import load_config + or_config = load_config().get("openrouter", {}) + except Exception: + or_config = {} + + # Determine cache enabled: env var overrides config. + env_cache = os.environ.get("HERMES_OPENROUTER_CACHE", "").strip().lower() + if env_cache: + cache_enabled = env_cache in _TRUTHY_ENV_VALUES + else: + cache_enabled = or_config.get("response_cache", False) + + if not cache_enabled: + return headers + + headers["X-OpenRouter-Cache"] = "true" + + # Determine TTL: env var overrides config. + env_ttl = os.environ.get("HERMES_OPENROUTER_CACHE_TTL", "").strip() + if env_ttl: + if env_ttl.isdigit(): + ttl = int(env_ttl) + if 1 <= ttl <= 86400: + headers["X-OpenRouter-Cache-TTL"] = str(ttl) + else: + ttl = or_config.get("response_cache_ttl", 300) + if isinstance(ttl, (int, float)) and 1 <= ttl <= 86400: + headers["X-OpenRouter-Cache-TTL"] = str(int(ttl)) + + return headers + # Vercel AI Gateway app attribution headers. HTTP-Referer maps to # referrerUrl and X-Title maps to appName in the gateway's analytics. from hermes_cli import __version__ as _HERMES_VERSION @@ -1149,23 +1204,23 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: -def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]: +def _try_openrouter(explicit_api_key: str = None) -> Tuple[Optional[OpenAI], Optional[str]]: pool_present, entry = _select_pool_entry("openrouter") if pool_present: - or_key = _pool_runtime_api_key(entry) + or_key = explicit_api_key or _pool_runtime_api_key(entry) if not or_key: return None, None base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL logger.debug("Auxiliary client: OpenRouter via pool") return OpenAI(api_key=or_key, base_url=base_url, - default_headers=_OR_HEADERS), _OPENROUTER_MODEL + default_headers=build_or_headers()), _OPENROUTER_MODEL - or_key = os.getenv("OPENROUTER_API_KEY") + or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY") if not or_key: return None, None logger.debug("Auxiliary client: OpenRouter") return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL, - default_headers=_OR_HEADERS), _OPENROUTER_MODEL + default_headers=build_or_headers()), _OPENROUTER_MODEL def _describe_openrouter_unavailable() -> str: @@ -1911,7 +1966,7 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False): } sync_base_url = str(sync_client.base_url) if base_url_host_matches(sync_base_url, "openrouter.ai"): - async_kwargs["default_headers"] = dict(_OR_HEADERS) + async_kwargs["default_headers"] = build_or_headers() elif base_url_host_matches(sync_base_url, "api.githubcopilot.com"): from hermes_cli.copilot_auth import copilot_request_headers @@ -2053,9 +2108,9 @@ def resolve_provider_client( return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode else (client, final_model)) - # โ”€โ”€ OpenRouter โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # โ”€โ”€ OpenRouter โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ if provider == "openrouter": - client, default = _try_openrouter() + client, default = _try_openrouter(explicit_api_key=explicit_api_key) if client is None: logger.warning( "resolve_provider_client: openrouter requested but %s", @@ -3237,7 +3292,26 @@ def _build_call_kwargs( kwargs["max_tokens"] = max_tokens if tools: - kwargs["tools"] = tools + # Defensive dedup: providers like Google Vertex, Azure, and Bedrock + # reject requests with duplicate tool names (HTTP 400). The upstream + # injection paths (run_agent.py) already dedup, but this guard + # converts a hard API failure into a warning if an upstream regression + # reintroduces duplicates. See: #18478 + _seen: set = set() + _deduped: list = [] + for _t in tools: + _tname = (_t.get("function") or {}).get("name", "") + if _tname and _tname in _seen: + logger.warning( + "_build_call_kwargs: duplicate tool name '%s' removed " + "(provider=%s model=%s)", + _tname, provider, model, + ) + continue + if _tname: + _seen.add(_tname) + _deduped.append(_t) + kwargs["tools"] = _deduped # Provider-specific extra_body merged_extra = dict(extra_body or {}) diff --git a/agent/credential_pool.py b/agent/credential_pool.py index 004b574988..27a16bd435 100644 --- a/agent/credential_pool.py +++ b/agent/credential_pool.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import os import random import threading import time @@ -13,7 +14,7 @@ from datetime import datetime from typing import Any, Dict, List, Optional, Set, Tuple from hermes_constants import OPENROUTER_BASE_URL -from hermes_cli.config import get_env_value +from hermes_cli.config import get_env_value, load_env import hermes_cli.auth as auth_mod from hermes_cli.auth import ( CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS, @@ -1380,6 +1381,16 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup def _seed_from_env(provider: str, entries: List[PooledCredential]) -> Tuple[bool, Set[str]]: changed = False active_sources: Set[str] = set() + + # Prefer ~/.hermes/.env over os.environ โ€” the user's config file is the + # authoritative source for Hermes credentials. Stale env vars from parent + # processes (Codex CLI, test scripts, etc.) should not override deliberate + # changes to the .env file. + def _get_env_prefer_dotenv(key: str) -> str: + env_file = load_env() + val = env_file.get(key) or os.environ.get(key) or "" + return val.strip() + # Honour user suppression โ€” `hermes auth remove ` for an # env-seeded credential marks the env: source as suppressed so it # won't be re-seeded from the user's shell environment or ~/.hermes/.env. @@ -1391,8 +1402,8 @@ def _seed_from_env(provider: str, entries: List[PooledCredential]) -> Tuple[bool def _is_source_suppressed(_p, _s): # type: ignore[misc] return False if provider == "openrouter": - # Check both os.environ and ~/.hermes/.env file - token = (get_env_value("OPENROUTER_API_KEY") or "").strip() + # Prefer ~/.hermes/.env over os.environ + token = _get_env_prefer_dotenv("OPENROUTER_API_KEY") if token: source = "env:OPENROUTER_API_KEY" if _is_source_suppressed(provider, source): @@ -1418,7 +1429,7 @@ def _seed_from_env(provider: str, entries: List[PooledCredential]) -> Tuple[bool env_url = "" if pconfig.base_url_env_var: - env_url = (get_env_value(pconfig.base_url_env_var) or "").strip().rstrip("/") + env_url = _get_env_prefer_dotenv(pconfig.base_url_env_var).rstrip("/") env_vars = list(pconfig.api_key_env_vars) if provider == "anthropic": @@ -1429,8 +1440,8 @@ def _seed_from_env(provider: str, entries: List[PooledCredential]) -> Tuple[bool ] for env_var in env_vars: - # Check both os.environ and ~/.hermes/.env file - token = (get_env_value(env_var) or "").strip() + # Prefer ~/.hermes/.env over os.environ + token = _get_env_prefer_dotenv(env_var) if not token: continue source = f"env:{env_var}" diff --git a/agent/curator.py b/agent/curator.py index 2eebe10ef5..cce3d8c103 100644 --- a/agent/curator.py +++ b/agent/curator.py @@ -387,6 +387,11 @@ CURATOR_REVIEW_PROMPT = ( " - skill_manage action=write_file โ€” add a references/, templates/, " "or scripts/ file under an existing skill (the skill must already " "exist)\n" + " - skill_manage action=delete โ€” archive a skill. MUST pass " + "`absorbed_into=` when you've merged its content into another " + "skill, or `absorbed_into=\"\"` when you're truly pruning with no " + "forwarding target. This drives cron-job skill-reference migration โ€” " + "guessing from your YAML summary after the fact is fragile.\n" " - terminal โ€” mv a sibling into the archive " "OR move its content into a support subfile\n\n" "'keep' is a legitimate decision ONLY when the skill is already a " @@ -637,15 +642,76 @@ def _parse_structured_summary( return out +def _extract_absorbed_into_declarations( + tool_calls: List[Dict[str, Any]], +) -> Dict[str, Dict[str, Any]]: + """Walk this run's tool calls and extract model-declared absorption targets. + + The curator prompt requires every ``skill_manage(action='delete')`` call + to pass ``absorbed_into=`` when consolidating, or + ``absorbed_into=""`` when truly pruning. This is the single authoritative + signal for classification โ€” the model's own declaration at the moment of + deletion, which beats both post-hoc YAML summary parsing and substring + heuristics on other tool calls. + + Returns ``{skill_name: {"into": "" | "", "declared": True}}``. + Entries with ``into == ""`` are explicit prunings. + Skills without a ``skill_manage(delete)`` call, or with one that omitted + ``absorbed_into``, are not in the returned dict โ€” caller falls back to + the existing heuristic/YAML logic for those (backward compat with older + curator runs and any callers that don't populate the arg). + """ + out: Dict[str, Dict[str, Any]] = {} + for tc in tool_calls or []: + if not isinstance(tc, dict): + continue + if tc.get("name") != "skill_manage": + continue + raw = tc.get("arguments") or "" + args: Dict[str, Any] = {} + if isinstance(raw, dict): + args = raw + elif isinstance(raw, str): + try: + args = json.loads(raw) + except Exception: + continue + if not isinstance(args, dict): + continue + if args.get("action") != "delete": + continue + name = args.get("name") + if not isinstance(name, str) or not name.strip(): + continue + # absorbed_into must be present (even empty string is meaningful); + # missing key means the model didn't declare intent. + if "absorbed_into" not in args: + continue + target = args.get("absorbed_into") + if target is None: + continue + if not isinstance(target, str): + continue + out[name.strip()] = {"into": target.strip(), "declared": True} + return out + + def _reconcile_classification( removed: List[str], heuristic: Dict[str, List[Dict[str, Any]]], model_block: Dict[str, List[Dict[str, str]]], destinations: Set[str], + absorbed_declarations: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Dict[str, List[Dict[str, Any]]]: """Merge heuristic (tool-call evidence) with the model's structured block. - Rules: + Rules (evaluated in order; first match wins): + - **Model-declared `absorbed_into` at delete time is authoritative.** Any + entry in ``absorbed_declarations`` beats every other signal. This is + the model telling us directly, at the moment of deletion, what it did. + ``into != ""`` and target exists โ†’ consolidated. ``into == ""`` โ†’ + pruned. ``into != ""`` but target doesn't exist โ†’ hallucination; fall + through to the usual signals. - Model-declared consolidation wins when its ``into`` target exists in ``destinations`` (survived or newly-created). This gives the model authority over intent + rationale. @@ -666,6 +732,8 @@ def _reconcile_classification( model_cons = {e["from"]: e for e in model_block.get("consolidations", [])} model_pruned = {e["name"]: e for e in model_block.get("prunings", [])} + declared = absorbed_declarations or {} + consolidated: List[Dict[str, Any]] = [] pruned: List[Dict[str, Any]] = [] @@ -673,6 +741,36 @@ def _reconcile_classification( mc = model_cons.get(name) mp = model_pruned.get(name) hc = heur_cons.get(name) + dec = declared.get(name) + + # Authoritative: model declared `absorbed_into` at the delete call. + if dec is not None: + into_claim = dec.get("into", "") + if into_claim and into_claim in destinations: + entry: Dict[str, Any] = { + "name": name, + "into": into_claim, + "source": "absorbed_into (model-declared at delete)", + "reason": (mc.get("reason") or "") if mc else "", + } + if hc and hc.get("evidence"): + entry["evidence"] = hc["evidence"] + consolidated.append(entry) + continue + if into_claim == "": + # Explicit prune declaration + pruned.append({ + "name": name, + "source": "absorbed_into=\"\" (model-declared prune)", + "reason": (mp.get("reason") or "") if mp else "", + }) + continue + # into_claim is non-empty but target doesn't exist: the model + # named a nonexistent umbrella at delete time. The tool already + # rejects this at the skill_manage layer, so we shouldn't see it + # in practice โ€” but if it slips through (e.g. the umbrella was + # deleted LATER in the same run), fall through to the usual + # signals rather than trusting a broken reference. # Model says consolidated โ€” trust it if the destination is real. if mc and mc.get("into") in destinations: @@ -808,11 +906,20 @@ def _write_run_report( ) model_block = _parse_structured_summary(llm_meta.get("final", "") or "") destinations = set(after_names) | set(added or []) + # Authoritative signal: extract per-delete `absorbed_into` declarations + # from this run's tool calls. These beat both the YAML summary block and + # the substring heuristic โ€” the model is telling us directly, at the + # moment of deletion, whether each archived skill was consolidated + # (into=) or pruned (into=""). + absorbed_declarations = _extract_absorbed_into_declarations( + llm_meta.get("tool_calls", []) or [] + ) classification = _reconcile_classification( removed=removed, heuristic=heuristic, model_block=model_block, destinations=destinations, + absorbed_declarations=absorbed_declarations, ) consolidated = classification["consolidated"] pruned = classification["pruned"] diff --git a/agent/curator_backup.py b/agent/curator_backup.py index 268de64f41..fe74920521 100644 --- a/agent/curator_backup.py +++ b/agent/curator_backup.py @@ -21,6 +21,18 @@ It DOES include: pointer โ€” otherwise the curator would immediately re-fire on the next tick) - ``.bundled_manifest`` (so protection markers stay consistent) + +Alongside the skills tarball, each snapshot also captures a copy of +``~/.hermes/cron/jobs.json`` as ``cron-jobs.json`` when it exists. Cron +jobs reference skills by name in their ``skills``/``skill`` fields; the +curator's consolidation pass rewrites those in place via +``cron.jobs.rewrite_skill_refs()``. Without capturing the pre-run state, +rolling back the skills tree would leave cron jobs pointing at the +umbrella skills even though the narrow skills they were originally +configured with have been restored. We store the whole jobs.json for +fidelity but rollback only touches the ``skills``/``skill`` fields โ€” the +rest (schedule, next_run_at, enabled, prompt, etc.) is live state and +we leave it alone. """ from __future__ import annotations @@ -63,6 +75,60 @@ def _skills_dir() -> Path: return get_hermes_home() / "skills" +def _cron_jobs_file() -> Path: + """Source path for the live cron jobs store (``~/.hermes/cron/jobs.json``).""" + return get_hermes_home() / "cron" / "jobs.json" + + +CRON_JOBS_FILENAME = "cron-jobs.json" + + +def _backup_cron_jobs_into(dest: Path) -> Dict[str, Any]: + """Copy the live cron jobs.json into ``dest`` as ``cron-jobs.json``. + + Returns a small dict describing what was captured so the caller can + fold it into the manifest. Never raises โ€” if the cron file is missing + or unreadable, the return dict has ``backed_up=False`` and the reason, + and the snapshot proceeds without cron data (the snapshot is still + useful for rolling back skills). + """ + src = _cron_jobs_file() + info: Dict[str, Any] = {"backed_up": False, "jobs_count": 0} + if not src.exists(): + info["reason"] = "no cron/jobs.json present" + return info + try: + raw = src.read_text(encoding="utf-8") + except OSError as e: + logger.debug("Failed to read cron/jobs.json for backup: %s", e) + info["reason"] = f"read error: {e}" + return info + # Count jobs as a nice diagnostic โ€” but don't fail the snapshot if the + # file is unparseable; just store the raw text and let rollback deal + # with it (or not, if it's corrupted). jobs.json wraps the list as + # `{"jobs": [...], "updated_at": ...}` โ€” we count via that shape, and + # fall back to bare-list shape just in case the format ever changes. + try: + parsed = json.loads(raw) + if isinstance(parsed, dict): + inner = parsed.get("jobs") + if isinstance(inner, list): + info["jobs_count"] = len(inner) + elif isinstance(parsed, list): + info["jobs_count"] = len(parsed) + except (json.JSONDecodeError, TypeError): + info["jobs_count"] = 0 + info["parse_warning"] = "jobs.json was not valid JSON at snapshot time" + try: + (dest / CRON_JOBS_FILENAME).write_text(raw, encoding="utf-8") + except OSError as e: + logger.debug("Failed to write cron backup file: %s", e) + info["reason"] = f"write error: {e}" + return info + info["backed_up"] = True + return info + + def _utc_id(now: Optional[datetime] = None) -> str: """UTC ISO-ish filesystem-safe timestamp: ``2026-05-01T13-05-42Z``.""" if now is None: @@ -116,7 +182,8 @@ def _count_skill_files(base: Path) -> int: def _write_manifest(dest: Path, reason: str, archive_path: Path, - skills_counted: int) -> None: + skills_counted: int, + cron_info: Optional[Dict[str, Any]] = None) -> None: manifest = { "id": dest.name, "reason": reason, @@ -125,6 +192,15 @@ def _write_manifest(dest: Path, reason: str, archive_path: Path, "archive_bytes": archive_path.stat().st_size, "skill_files": skills_counted, } + if cron_info is not None: + manifest["cron_jobs"] = { + "backed_up": bool(cron_info.get("backed_up", False)), + "jobs_count": int(cron_info.get("jobs_count", 0)), + } + if not cron_info.get("backed_up"): + manifest["cron_jobs"]["reason"] = cron_info.get("reason", "not captured") + if cron_info.get("parse_warning"): + manifest["cron_jobs"]["parse_warning"] = cron_info["parse_warning"] (dest / "manifest.json").write_text( json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8" ) @@ -181,7 +257,14 @@ def snapshot_skills(reason: str = "manual") -> Optional[Path]: # arcname: store paths relative to skills/ so extraction # drops cleanly back into the skills dir. tf.add(str(entry), arcname=entry.name, recursive=True) - _write_manifest(dest, reason, archive, _count_skill_files(skills)) + # Capture cron/jobs.json alongside the tarball. Never fails the + # snapshot โ€” the skills side is the core guarantee; cron is + # additive. We still record in the manifest whether it was + # captured so rollback can surface "no cron data in this snapshot". + cron_info = _backup_cron_jobs_into(dest) + _write_manifest(dest, reason, archive, + _count_skill_files(skills), + cron_info=cron_info) except (OSError, tarfile.TarError) as e: logger.debug("Curator snapshot failed: %s", e, exc_info=True) # Clean up partial snapshot @@ -298,6 +381,149 @@ def _resolve_backup(backup_id: Optional[str]) -> Optional[Path]: return candidates[0] if candidates else None +def _restore_cron_skill_links(snapshot_dir: Path) -> Dict[str, Any]: + """Reconcile backed-up cron skill links into the live ``cron/jobs.json``. + + We do NOT overwrite the whole cron file. Only the ``skills`` and + ``skill`` fields are restored, and only on jobs that still exist in the + current file (matched by ``id``). Everything else about the job โ€” + schedule, next_run_at, last_run_at, enabled, prompt, workdir, hooks โ€” + is live state that the user/scheduler has modified since the snapshot; + overwriting it would regress unrelated cron activity. + + Rules: + - Jobs present in backup AND live, with differing skills โ†’ skills restored. + - Jobs present in backup AND live, with matching skills โ†’ no-op. + - Jobs present in backup but gone from live (user deleted the job + after the snapshot) โ†’ skipped, noted in the return report. + - Jobs present in live but not in backup (user created a new cron + job after the snapshot) โ†’ left untouched. + + Never raises; failures are captured in the return dict. Writes through + ``cron.jobs`` to pick up the same lock + atomic-write path that tick() + uses, so we don't race the scheduler. + """ + report: Dict[str, Any] = { + "attempted": False, + "restored": [], + "skipped_missing": [], + "unchanged": 0, + "error": None, + } + backup_file = snapshot_dir / CRON_JOBS_FILENAME + if not backup_file.exists(): + report["error"] = f"snapshot has no {CRON_JOBS_FILENAME}" + return report + + try: + backup_text = backup_file.read_text(encoding="utf-8") + backup_parsed = json.loads(backup_text) + except (OSError, json.JSONDecodeError) as e: + report["error"] = f"failed to load backed-up jobs: {e}" + return report + # jobs.json on disk is `{"jobs": [...], "updated_at": ...}`; accept both + # that shape and a bare list for forward compat. + if isinstance(backup_parsed, dict): + backup_jobs = backup_parsed.get("jobs") + elif isinstance(backup_parsed, list): + backup_jobs = backup_parsed + else: + backup_jobs = None + if not isinstance(backup_jobs, list): + report["error"] = "backed-up cron-jobs.json has no jobs list" + return report + + # Build a lookup of the backed-up skill state keyed by job id. + # We only need the two skill-ish fields (legacy single and modern list). + backup_by_id: Dict[str, Dict[str, Any]] = {} + for job in backup_jobs: + if not isinstance(job, dict): + continue + jid = job.get("id") + if not isinstance(jid, str) or not jid: + continue + backup_by_id[jid] = { + "skills": job.get("skills"), + "skill": job.get("skill"), + "name": job.get("name") or jid, + } + + if not backup_by_id: + report["attempted"] = True # we tried but there was nothing to do + return report + + # Load and rewrite the live jobs under the scheduler's lock. + try: + from cron.jobs import load_jobs, save_jobs, _jobs_file_lock + except ImportError as e: + report["error"] = f"cron module unavailable: {e}" + return report + + report["attempted"] = True + try: + with _jobs_file_lock: + live_jobs = load_jobs() + changed = False + + live_ids = set() + for live in live_jobs: + if not isinstance(live, dict): + continue + jid = live.get("id") + if not isinstance(jid, str) or not jid: + continue + live_ids.add(jid) + + backup = backup_by_id.get(jid) + if backup is None: + continue # live job didn't exist at snapshot time + + cur_skills = live.get("skills") + cur_skill = live.get("skill") + bkp_skills = backup.get("skills") + bkp_skill = backup.get("skill") + + if cur_skills == bkp_skills and cur_skill == bkp_skill: + report["unchanged"] += 1 + continue + + # Restore. Preserve absence (don't force the key to appear + # if the backup didn't have it either). + if bkp_skills is None: + live.pop("skills", None) + else: + live["skills"] = bkp_skills + if bkp_skill is None: + live.pop("skill", None) + else: + live["skill"] = bkp_skill + + report["restored"].append({ + "job_id": jid, + "job_name": backup.get("name") or jid, + "from": {"skills": cur_skills, "skill": cur_skill}, + "to": {"skills": bkp_skills, "skill": bkp_skill}, + }) + changed = True + + # Jobs in backup but not in live = user deleted them after snapshot + for jid, backup in backup_by_id.items(): + if jid not in live_ids: + report["skipped_missing"].append({ + "job_id": jid, + "job_name": backup.get("name") or jid, + }) + + if changed: + save_jobs(live_jobs) + except Exception as e: # noqa: BLE001 โ€” rollback must not die mid-restore + logger.debug("Cron skill-link restore failed: %s", e, exc_info=True) + report["error"] = f"restore failed mid-flight: {e}" + + return report + + + def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path]]: """Restore ``~/.hermes/skills/`` from a snapshot. @@ -408,8 +634,35 @@ def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path] except OSError: pass - logger.info("Curator rollback: restored from %s", target.name) - return (True, f"restored from snapshot {target.name}", target) + # Reconcile cron skill-links. Surgical: only the skills/skill fields + # on jobs matched by id. Everything else in jobs.json is live state + # (schedule, next_run_at, enabled, prompt, etc.) and we leave it + # alone. Failures here don't fail the overall rollback โ€” the skills + # tree is already restored, which is the main guarantee. + cron_report = _restore_cron_skill_links(target) + + summary_bits = [f"restored from snapshot {target.name}"] + if cron_report.get("attempted"): + restored_n = len(cron_report.get("restored") or []) + skipped_n = len(cron_report.get("skipped_missing") or []) + if cron_report.get("error"): + summary_bits.append(f"cron links: error โ€” {cron_report['error']}") + elif restored_n == 0 and skipped_n == 0 and cron_report.get("unchanged", 0) == 0: + # Attempted but nothing matched โ€” empty snapshot or no overlapping ids. + pass + else: + parts = [] + if restored_n: + parts.append(f"{restored_n} job(s) had skill links restored") + if skipped_n: + parts.append(f"{skipped_n} backed-up job(s) no longer exist (skipped)") + if cron_report.get("unchanged"): + parts.append(f"{cron_report['unchanged']} already matched") + summary_bits.append("cron links: " + ", ".join(parts)) + + logger.info("Curator rollback: restored from %s (cron_report=%s)", + target.name, cron_report) + return (True, "; ".join(summary_bits), target) # --------------------------------------------------------------------------- diff --git a/agent/skill_commands.py b/agent/skill_commands.py index ad1f03824d..0276d5fc9a 100644 --- a/agent/skill_commands.py +++ b/agent/skill_commands.py @@ -6,6 +6,7 @@ can invoke skills via /skill-name commands. import json import logging +import os import re from pathlib import Path from typing import Any, Dict, Optional @@ -20,10 +21,35 @@ from agent.skill_preprocessing import ( logger = logging.getLogger(__name__) _skill_commands: Dict[str, Dict[str, Any]] = {} +_skill_commands_platform: Optional[str] = None # Patterns for sanitizing skill names into clean hyphen-separated slugs. _SKILL_INVALID_CHARS = re.compile(r"[^a-z0-9-]") _SKILL_MULTI_HYPHEN = re.compile(r"-{2,}") + +def _resolve_skill_commands_platform() -> Optional[str]: + """Return the current platform scope used for disabled-skill filtering. + + Used to detect when the active platform has shifted so + :func:`get_skill_commands` can drop a stale cache that was populated + for a different platform's ``skills.platform_disabled`` view (#14536). + + Resolves from (in order) ``HERMES_PLATFORM`` env var and + ``HERMES_SESSION_PLATFORM`` from the gateway session context. Returns + ``None`` when no platform scope is active (e.g. classic CLI, RL + rollouts, standalone scripts). + """ + try: + from gateway.session_context import get_session_env + + resolved_platform = ( + os.getenv("HERMES_PLATFORM") + or get_session_env("HERMES_SESSION_PLATFORM") + ) + except Exception: + resolved_platform = os.getenv("HERMES_PLATFORM") + return resolved_platform or None + def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None: """Load a skill by name/path and return (loaded_payload, skill_dir, display_name).""" raw_identifier = (skill_identifier or "").strip() @@ -218,7 +244,8 @@ def scan_skill_commands() -> Dict[str, Dict[str, Any]]: Returns: Dict mapping "/skill-name" to {name, description, skill_md_path, skill_dir}. """ - global _skill_commands + global _skill_commands, _skill_commands_platform + _skill_commands_platform = _resolve_skill_commands_platform() _skill_commands = {} try: from tools.skills_tool import SKILLS_DIR, _parse_frontmatter, skill_matches_platform, _get_disabled_skill_names @@ -278,8 +305,16 @@ def scan_skill_commands() -> Dict[str, Dict[str, Any]]: def get_skill_commands() -> Dict[str, Dict[str, Any]]: - """Return the current skill commands mapping (scan first if empty).""" - if not _skill_commands: + """Return the current skill commands mapping (scan first if empty). + + Rescans when the active platform scope changes (e.g. a gateway + process serving Telegram and Discord concurrently) so each platform + sees its own ``skills.platform_disabled`` view (#14536). + """ + if ( + not _skill_commands + or _skill_commands_platform != _resolve_skill_commands_platform() + ): scan_skill_commands() return _skill_commands diff --git a/cli-config.yaml.example b/cli-config.yaml.example index c92be7e26b..963268d4ba 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -121,6 +121,18 @@ model: # # Data policy: "allow" (default) or "deny" to exclude providers that may store data # # data_collection: "deny" +# ============================================================================= +# OpenRouter Response Caching (only applies when using OpenRouter) +# ============================================================================= +# Cache identical API responses at the OpenRouter edge for free instant replays. +# When enabled, identical requests (same model, messages, parameters) return +# cached responses with zero billing. Separate from Anthropic prompt caching. +# See: https://openrouter.ai/docs/guides/features/response-caching +# +# openrouter: +# response_cache: true # Enable response caching (default: true) +# response_cache_ttl: 300 # Cache TTL in seconds, 1-86400 (default: 300) + # ============================================================================= # Git Worktree Isolation # ============================================================================= diff --git a/cli.py b/cli.py index f0ba6fc991..da917ae190 100644 --- a/cli.py +++ b/cli.py @@ -2928,7 +2928,14 @@ class HermesCLI: def _expand_ref(match): path = Path(match.group(1)) - return path.read_text(encoding="utf-8") if path.exists() else match.group(0) + # Use try/except instead of path.exists() to avoid TOCTOU race: + # the paste file may be deleted between check and read, causing + # the input to be silently dropped (#17666). + try: + return path.read_text(encoding="utf-8") + except (OSError, IOError): + logger.warning("Paste file gone or unreadable, returning placeholder: %s", path) + return match.group(0) return paste_ref_re.sub(_expand_ref, text) @@ -11584,7 +11591,7 @@ class HermesCLI: pass # Non-fatal โ€” don't break the main loop except Exception as e: - print(f"Error: {e}") + logger.warning("process_loop unhandled error (msg may be lost): %s", e) # Start processing thread process_thread = threading.Thread(target=process_loop, daemon=True) diff --git a/cron/scheduler.py b/cron/scheduler.py index 4672b24ba7..2cb1547ad3 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -123,9 +123,19 @@ _LOCK_FILE = _LOCK_DIR / ".tick.lock" def _resolve_origin(job: dict) -> Optional[dict]: - """Extract origin info from a job, preserving any extra routing metadata.""" + """Extract origin info from a job, preserving any extra routing metadata. + + Treats non-dict origins (free-form provenance strings, ints, lists from + migration scripts or hand-edited jobs.json) as missing instead of + crashing with ``AttributeError`` on ``origin.get(...)``. Without this + guard, a job tagged with e.g. ``"combined-digest-replaces-x-and-y"`` + crashed every fire attempt with + ``'str' object has no attribute 'get'`` โ€” ``mark_job_run`` recorded the + failure, but the next tick re-loaded the same poisoned origin and + crashed identically until the field was patched manually (#18722). + """ origin = job.get("origin") - if not origin: + if not isinstance(origin, dict): return None platform = origin.get("platform") chat_id = origin.get("chat_id") @@ -147,6 +157,19 @@ def _get_home_target_chat_id(platform_name: str) -> str: return value +def _get_home_target_thread_id(platform_name: str) -> Optional[str]: + """Return the optional thread/topic ID for a platform home target.""" + env_var = _HOME_TARGET_ENV_VARS.get(platform_name.lower()) + if not env_var: + return None + value = os.getenv(f"{env_var}_THREAD_ID", "").strip() + if not value: + legacy = _LEGACY_HOME_TARGET_ENV_VARS.get(env_var) + if legacy: + value = os.getenv(f"{legacy}_THREAD_ID", "").strip() + return value or None + + def _resolve_single_delivery_target(job: dict, deliver_value: str) -> Optional[dict]: """Resolve one concrete auto-delivery target for a cron job.""" @@ -175,7 +198,7 @@ def _resolve_single_delivery_target(job: dict, deliver_value: str) -> Optional[d return { "platform": platform_name, "chat_id": chat_id, - "thread_id": None, + "thread_id": _get_home_target_thread_id(platform_name), } return None @@ -229,7 +252,7 @@ def _resolve_single_delivery_target(job: dict, deliver_value: str) -> Optional[d return { "platform": platform_name, "chat_id": chat_id, - "thread_id": None, + "thread_id": _get_home_target_thread_id(platform_name), } diff --git a/gateway/config.py b/gateway/config.py index 6db8e55d84..6527accec4 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -186,18 +186,24 @@ class HomeChannel: Default destination for a platform. When a cron job specifies deliver="telegram" without a specific chat ID, - messages are sent to this home channel. + messages are sent to this home channel. Thread-aware platforms may also + store a thread/topic ID so the bare platform target routes to the exact + conversation where /sethome was run. """ platform: Platform chat_id: str name: str # Human-readable name for display + thread_id: Optional[str] = None def to_dict(self) -> Dict[str, Any]: - return { + result = { "platform": self.platform.value, "chat_id": self.chat_id, "name": self.name, } + if self.thread_id: + result["thread_id"] = self.thread_id + return result @classmethod def from_dict(cls, data: Dict[str, Any]) -> "HomeChannel": @@ -205,6 +211,7 @@ class HomeChannel: platform=Platform(data["platform"]), chat_id=str(data["chat_id"]), name=data.get("name", "Home"), + thread_id=str(data["thread_id"]) if data.get("thread_id") else None, ) @@ -1071,6 +1078,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.TELEGRAM, chat_id=telegram_home, name=os.getenv("TELEGRAM_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("TELEGRAM_HOME_CHANNEL_THREAD_ID") or None, ) # Discord @@ -1087,6 +1095,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.DISCORD, chat_id=discord_home, name=os.getenv("DISCORD_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("DISCORD_HOME_CHANNEL_THREAD_ID") or None, ) # Reply threading mode for Discord (off/first/all) @@ -1108,6 +1117,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.WHATSAPP, chat_id=whatsapp_home, name=os.getenv("WHATSAPP_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("WHATSAPP_HOME_CHANNEL_THREAD_ID") or None, ) # Slack @@ -1135,6 +1145,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.SLACK, chat_id=slack_home, name=os.getenv("SLACK_HOME_CHANNEL_NAME", ""), + thread_id=os.getenv("SLACK_HOME_CHANNEL_THREAD_ID") or None, ) # Signal @@ -1155,6 +1166,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.SIGNAL, chat_id=signal_home, name=os.getenv("SIGNAL_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("SIGNAL_HOME_CHANNEL_THREAD_ID") or None, ) # Mattermost @@ -1174,6 +1186,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.MATTERMOST, chat_id=mattermost_home, name=os.getenv("MATTERMOST_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("MATTERMOST_HOME_CHANNEL_THREAD_ID") or None, ) # Matrix @@ -1205,6 +1218,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.MATRIX, chat_id=matrix_home, name=os.getenv("MATRIX_HOME_ROOM_NAME", "Home"), + thread_id=os.getenv("MATRIX_HOME_ROOM_THREAD_ID") or None, ) # Home Assistant @@ -1238,6 +1252,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.EMAIL, chat_id=email_home, name=os.getenv("EMAIL_HOME_ADDRESS_NAME", "Home"), + thread_id=os.getenv("EMAIL_HOME_ADDRESS_THREAD_ID") or None, ) # SMS (Twilio) @@ -1253,6 +1268,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.SMS, chat_id=sms_home, name=os.getenv("SMS_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("SMS_HOME_CHANNEL_THREAD_ID") or None, ) # API Server @@ -1315,6 +1331,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.DINGTALK, chat_id=dingtalk_home, name=os.getenv("DINGTALK_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("DINGTALK_HOME_CHANNEL_THREAD_ID") or None, ) # Feishu / Lark @@ -1342,6 +1359,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.FEISHU, chat_id=feishu_home, name=os.getenv("FEISHU_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("FEISHU_HOME_CHANNEL_THREAD_ID") or None, ) # WeCom (Enterprise WeChat) @@ -1364,6 +1382,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.WECOM, chat_id=wecom_home, name=os.getenv("WECOM_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("WECOM_HOME_CHANNEL_THREAD_ID") or None, ) # WeCom callback mode (self-built apps) @@ -1422,6 +1441,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.WEIXIN, chat_id=weixin_home, name=os.getenv("WEIXIN_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("WEIXIN_HOME_CHANNEL_THREAD_ID") or None, ) # BlueBubbles (iMessage) @@ -1445,6 +1465,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.BLUEBUBBLES, chat_id=bluebubbles_home, name=os.getenv("BLUEBUBBLES_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("BLUEBUBBLES_HOME_CHANNEL_THREAD_ID") or None, ) # QQ (Official Bot API v2) @@ -1482,6 +1503,11 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.QQBOT, chat_id=qq_home, name=os.getenv("QQBOT_HOME_CHANNEL_NAME") or os.getenv(qq_home_name_env, "Home"), + thread_id=( + os.getenv("QQBOT_HOME_CHANNEL_THREAD_ID") + or os.getenv("QQ_HOME_CHANNEL_THREAD_ID") + or None + ), ) # Yuanbao โ€” YUANBAO_APP_ID preferred @@ -1512,6 +1538,7 @@ def _apply_env_overrides(config: GatewayConfig) -> None: platform=Platform.YUANBAO, chat_id=yuanbao_home, name=os.getenv("YUANBAO_HOME_CHANNEL_NAME", "Home"), + thread_id=os.getenv("YUANBAO_HOME_CHANNEL_THREAD_ID") or None, ) yuanbao_dm_policy = os.getenv("YUANBAO_DM_POLICY") if yuanbao_dm_policy: diff --git a/gateway/platforms/_http_client_limits.py b/gateway/platforms/_http_client_limits.py new file mode 100644 index 0000000000..4d8a7c86e9 --- /dev/null +++ b/gateway/platforms/_http_client_limits.py @@ -0,0 +1,84 @@ +"""Shared HTTP client factory for long-lived platform adapters. + +Gateway messaging platforms (QQ Bot, Feishu, WeCom, DingTalk, Signal, +BlueBubbles, WeCom-callback) keep a persistent ``httpx.AsyncClient`` +alive for the adapter's lifetime. That amortises TLS/connection setup +across many API calls, but it also means the process's file-descriptor +pressure is sensitive to how aggressively the pool recycles idle keep- +alive connections. + +httpx's default ``keepalive_expiry`` is 5 seconds. On macOS behind +Cloudflare Warp (and other transparent proxies), peer-initiated FIN can +sit in ``CLOSE_WAIT`` longer than that before the local socket actually +drains โ€” which, multiplied across 7 long-lived adapters plus the LLM +client and MCP clients, walks straight into the default 256 fd limit. +See #18451. + +``platform_httpx_limits()`` returns a tighter ``httpx.Limits`` the +adapter factories use instead of the httpx default. The values chosen: + +* ``max_keepalive_connections=10`` โ€” plenty for any single adapter; + platform APIs rarely parallelise beyond this. +* ``keepalive_expiry=2.0`` โ€” close idle sockets aggressively so a + proxy's lingering CLOSE_WAIT window can't starve the process. + +Override via ``HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY`` / +``HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE`` env vars when tuning under load. +""" + +from __future__ import annotations + +import os + +try: + import httpx +except ImportError: # pragma: no cover โ€” optional dep + httpx = None # type: ignore[assignment] + + +_DEFAULT_KEEPALIVE_EXPIRY_S = 2.0 +_DEFAULT_MAX_KEEPALIVE = 10 + + +def platform_httpx_limits() -> "httpx.Limits | None": + """Return ``httpx.Limits`` tuned for persistent platform-adapter clients. + + Returns ``None`` when httpx isn't importable, so callers can fall + back to httpx's built-in default without a hard dependency on this + helper being reachable. + """ + if httpx is None: + return None + + def _env_float(name: str, default: float) -> float: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + val = float(raw) + except (TypeError, ValueError): + return default + return val if val > 0 else default + + def _env_int(name: str, default: int) -> int: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + val = int(raw) + except (TypeError, ValueError): + return default + return val if val > 0 else default + + keepalive_expiry = _env_float( + "HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY", _DEFAULT_KEEPALIVE_EXPIRY_S + ) + max_keepalive = _env_int( + "HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE", _DEFAULT_MAX_KEEPALIVE + ) + + return httpx.Limits( + max_keepalive_connections=max_keepalive, + # Leave max_connections at httpx default (100) โ€” plenty of headroom. + keepalive_expiry=keepalive_expiry, + ) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index ef08b05405..78e0dd7e25 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -2489,15 +2489,20 @@ class BasePlatformAdapter(ABC): try: response = await self._message_handler(event) - # Old adapter task (if any) is cancelled AFTER the runner has - # fully handled the command โ€” keeps ordering deterministic. - await self.cancel_session_processing( - session_key, - release_guard=False, - discard_pending=False, - ) _text, _eph_ttl = self._unwrap_ephemeral(response) + # Send the response BEFORE cancelling the old task so the send + # cannot be affected by task-cancellation side effects (race + # condition fix โ€” issue #18912). Previously the send happened + # after cancel_session_processing, which could silently drop the + # "/new" confirmation when an agent was actively running. if _text: + logger.info( + "[%s] Sending command '/%s' response (%d chars) to %s", + self.name, + cmd, + len(_text), + event.source.chat_id, + ) _r = await self._send_with_retry( chat_id=event.source.chat_id, content=_text, @@ -2510,6 +2515,13 @@ class BasePlatformAdapter(ABC): message_id=_r.message_id, ttl_seconds=_eph_ttl, ) + # Old adapter task (if any) is cancelled AFTER the response has + # been sent โ€” keeps ordering deterministic and avoids the race. + await self.cancel_session_processing( + session_key, + release_guard=False, + discard_pending=False, + ) except Exception: # On failure, restore the original guard if one still exists so # we don't leave the session in a half-reset state. diff --git a/gateway/platforms/bluebubbles.py b/gateway/platforms/bluebubbles.py index afcbf1a7e4..31120785c0 100644 --- a/gateway/platforms/bluebubbles.py +++ b/gateway/platforms/bluebubbles.py @@ -162,7 +162,9 @@ class BlueBubblesAdapter(BasePlatformAdapter): return False from aiohttp import web - self.client = httpx.AsyncClient(timeout=30.0) + # Tighter keepalive so idle CLOSE_WAIT drains promptly (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits + self.client = httpx.AsyncClient(timeout=30.0, limits=platform_httpx_limits()) try: await self._api_get("/api/v1/ping") info = await self._api_get("/api/v1/server/info") diff --git a/gateway/platforms/dingtalk.py b/gateway/platforms/dingtalk.py index 3037e402b2..f1520e22c6 100644 --- a/gateway/platforms/dingtalk.py +++ b/gateway/platforms/dingtalk.py @@ -228,7 +228,11 @@ class DingTalkAdapter(BasePlatformAdapter): return False try: - self._http_client = httpx.AsyncClient(timeout=30.0) + # Tighter keepalive so idle CLOSE_WAIT drains promptly (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits + self._http_client = httpx.AsyncClient( + timeout=30.0, limits=platform_httpx_limits(), + ) credential = dingtalk_stream.Credential( self._client_id, self._client_secret diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 60cfb55ef6..243e81d3e8 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -497,6 +497,7 @@ class DiscordAdapter(BasePlatformAdapter): self._ready_event = asyncio.Event() self._allowed_user_ids: set = set() # For button approval authorization self._allowed_role_ids: set = set() # For DISCORD_ALLOWED_ROLES filtering + self.gateway_runner = None # Set by gateway/run.py for cross-platform delivery # Voice channel state (per-guild) self._voice_clients: Dict[int, Any] = {} # guild_id -> VoiceClient self._voice_locks: Dict[int, asyncio.Lock] = {} # guild_id -> serialize join/leave @@ -613,6 +614,21 @@ class DiscordAdapter(BasePlatformAdapter): # so LLM output or echoed user content can't ping the whole # server; override per DISCORD_ALLOW_MENTION_* env vars or the # discord.allow_mentions.* block in config.yaml. + + # Close any existing client to prevent zombie websocket connections + # on reconnect (see #18187). Without this, the old client remains + # connected to Discord gateway and both fire on_message, causing + # double responses. + if self._client is not None: + try: + if not self._client.is_closed(): + await self._client.close() + except Exception: + logger.debug("[%s] Failed to close previous Discord client", self.name) + finally: + self._client = None + self._ready_event.clear() + self._client = commands.Bot( command_prefix="!", # Not really used, we handle raw messages intents=intents, @@ -1914,6 +1930,225 @@ class DiscordAdapter(BasePlatformAdapter): return True return False + # โ”€โ”€ Slash command authorization โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # Slash commands (``_run_simple_slash`` and ``_handle_thread_create_slash``) + # are a separate Discord interaction surface from regular messages and + # historically ran with NO authorization check โ€” bypassing every gate + # ``on_message`` enforces (DISCORD_ALLOWED_USERS, DISCORD_ALLOWED_ROLES, + # DISCORD_ALLOWED_CHANNELS, DISCORD_IGNORED_CHANNELS). Any guild member + # could invoke ``/background``, ``/restart``, ``/sethome``, etc. as the + # operator. ``_check_slash_authorization`` mirrors the on_message gates + # one-for-one so the slash surface honors the same trust boundary. + # + # By design, this is a no-op for deployments with no allowlist env vars + # set โ€” ``_is_allowed_user`` returns True and the channel checks early-out + # โ€” preserving the existing "single-tenant, all guild members trusted" + # default. Deployments that DO set any DISCORD_ALLOWED_* var get slash + # parity with on_message. + + def _evaluate_slash_authorization( + self, interaction: "discord.Interaction", + ) -> Tuple[bool, Optional[str]]: + """Evaluate slash authorization without producing any response. + + Returns ``(allowed, reason)``. ``reason`` is populated only when + ``allowed`` is False. This is the shared core used by both the + responding wrapper (``_check_slash_authorization``) and side-effect- + free callers like the ``/skill`` autocomplete callback, which must + return an empty list for unauthorized users instead of leaking an + ephemeral rejection per-keystroke. + + Fail-closed semantics for malformed payloads: when an allowlist is + configured but the interaction is missing the data needed to + evaluate it (no channel id with channel policy active, no user + with user/role policy active), the gate REJECTS rather than + falling through. Without these guards a guild interaction that + happens to deserialize without a channel id would silently bypass + ``DISCORD_ALLOWED_CHANNELS`` and a payload missing ``user`` would + raise ``AttributeError`` in the user check below, surfacing as + an opaque interaction failure rather than a clean rejection. + """ + chan_obj = getattr(interaction, "channel", None) + in_dm = isinstance(chan_obj, discord.DMChannel) if chan_obj is not None else False + + # โ”€โ”€ Channel scope (mirrors on_message lines 3374-3388) โ”€โ”€ + # DMs aren't channel-gated โ€” DMs follow on_message's DM lockdown + # path which has its own user-allowlist enforcement. + if not in_dm: + chan_id_raw = getattr(interaction, "channel_id", None) or getattr( + chan_obj, "id", None, + ) + channel_ids: set = set() + if chan_id_raw is not None: + channel_ids.add(str(chan_id_raw)) + # Mirror on_message: also test the parent channel for threads + # so per-channel allow/deny lists work consistently. + if isinstance(chan_obj, discord.Thread): + parent_id = self._get_parent_channel_id(chan_obj) + if parent_id: + channel_ids.add(str(parent_id)) + + allowed_raw = os.getenv("DISCORD_ALLOWED_CHANNELS", "") + if allowed_raw: + allowed = {c.strip() for c in allowed_raw.split(",") if c.strip()} + if "*" not in allowed: + if not channel_ids: + # Channel policy is configured but the interaction + # has no resolvable channel id. Fail closed. + return ( + False, + "channel id missing with DISCORD_ALLOWED_CHANNELS configured", + ) + if not (channel_ids & allowed): + return (False, "channel not in DISCORD_ALLOWED_CHANNELS") + + # Ignored beats allowed: even when a thread's parent channel + # is on the allowlist, an explicit DISCORD_IGNORED_CHANNELS + # entry on the thread or its parent rejects the interaction. + ignored_raw = os.getenv("DISCORD_IGNORED_CHANNELS", "") + if ignored_raw and channel_ids: + ignored = {c.strip() for c in ignored_raw.split(",") if c.strip()} + if "*" in ignored or (channel_ids & ignored): + return (False, "channel in DISCORD_IGNORED_CHANNELS") + + # โ”€โ”€ User / role allowlist (mirrors on_message line 681) โ”€โ”€ + user = getattr(interaction, "user", None) + allowed_users = getattr(self, "_allowed_user_ids", set()) or set() + allowed_roles = getattr(self, "_allowed_role_ids", set()) or set() + if user is None or getattr(user, "id", None) is None: + # No identifiable user. With any user/role allowlist + # configured, fail closed rather than raise AttributeError + # on ``interaction.user.id`` below. With no allowlist this + # is the existing "no allowlist = everyone" backwards-compat. + if allowed_users or allowed_roles: + return (False, "missing interaction.user with allowlist configured") + return (True, None) + + user_id = str(user.id) + if not self._is_allowed_user(user_id, author=user): + return ( + False, + "user not in DISCORD_ALLOWED_USERS / DISCORD_ALLOWED_ROLES", + ) + + return (True, None) + + async def _check_slash_authorization( + self, interaction: "discord.Interaction", command_text: str, + ) -> bool: + """Mirror on_message's user/role/channel gates onto a slash invocation. + + Returns True to proceed. Returns False *after* sending an ephemeral + rejection, logging a warning, and scheduling a cross-platform admin + alert โ€” the caller must stop on False (the interaction has already + been responded to). + """ + allowed, reason = self._evaluate_slash_authorization(interaction) + if allowed: + return True + return await self._reject_slash( + interaction, command_text, reason=reason or "unauthorized", + ) + + async def _reject_slash( + self, interaction: "discord.Interaction", command_text: str, *, reason: str, + ) -> bool: + """Send ephemeral reject + log warning + schedule admin alert. Returns False. + + Tolerates a missing ``interaction.user`` -- the fail-closed branch + in ``_evaluate_slash_authorization`` deliberately routes here for + malformed payloads (no user) when an allowlist is configured, and + ``str(interaction.user.id)`` would raise AttributeError before the + ephemeral rejection could be sent. + """ + user = getattr(interaction, "user", None) + if user is not None: + user_id = str(getattr(user, "id", "?")) + user_name = getattr(user, "name", "?") + else: + user_id = "?" + user_name = "?" + chan_id = getattr(interaction, "channel_id", None) or getattr( + getattr(interaction, "channel", None), "id", None, + ) + guild_id = getattr(interaction, "guild_id", None) + + logger.warning( + "[Discord] Unauthorized slash attempt: user=%s id=%s channel=%s " + "guild=%s cmd=%r reason=%r", + user_name, user_id, chan_id, guild_id, command_text, reason, + ) + + try: + await interaction.response.send_message( + "You're not authorized to use this command.", + ephemeral=True, + ) + except Exception as e: + # Interaction may already be responded to (e.g. caller deferred + # before the auth check, or Discord retried). Best-effort only. + logger.debug("[Discord] Could not send unauthorized ephemeral: %s", e) + + # Fire-and-forget: don't block the interaction handler on Telegram I/O. + try: + asyncio.create_task(self._notify_unauthorized_slash( + user_name, user_id, chan_id, guild_id, command_text, reason, + )) + except Exception as e: + logger.debug("[Discord] Could not schedule admin notify task: %s", e) + + return False + + async def _notify_unauthorized_slash( + self, user_name: str, user_id: str, chan_id, guild_id, + command_text: str, reason: str, + ) -> None: + """Best-effort cross-platform alert to the gateway operator. + + Tries TELEGRAM first (most operators set TELEGRAM_HOME_CHANNEL), + then SLACK. Silently no-ops if no other platform is configured + with a home channel. + + A soft send failure -- adapter.send() returning a result with + ``success=False`` rather than raising -- continues the fallback + chain. Treating a SendResult(success=False) as delivered would + mean a Telegram outage that the adapter politely surfaces (e.g. + rate-limit, auth failure) silently swallows the alert without + attempting Slack. Hard exceptions still take the same path via + the except branch below. + """ + runner = getattr(self, "gateway_runner", None) + if not runner: + return + for target in (Platform.TELEGRAM, Platform.SLACK): + try: + adapter = runner.adapters.get(target) + if not adapter: + continue + home = runner.config.get_home_channel(target) + if not home or not getattr(home, "chat_id", None): + continue + msg = ( + "โš ๏ธ Unauthorized Discord slash attempt\n" + f"User: {user_name} ({user_id})\n" + f"Channel: {chan_id} (guild {guild_id})\n" + f"Command: {command_text}\n" + f"Reason: {reason}" + ) + result = await adapter.send(str(home.chat_id), msg) + # Only return on confirmed delivery. SendResult(success=False) + # -> continue to the next platform. + if getattr(result, "success", None) is False: + logger.debug( + "[Discord] Admin notify via %s returned success=False" + " (error=%r); falling through", + target, getattr(result, "error", None), + ) + continue + return + except Exception as e: + logger.debug("[Discord] Admin notify via %s failed: %s", target, e) + async def send_image_file( self, chat_id: str, @@ -2301,6 +2536,11 @@ class DiscordAdapter(BasePlatformAdapter): except Exception: pass # logging must never block command dispatch + # Auth gate โ€” must run before defer() so an ephemeral rejection can + # be delivered on the still-unresponded interaction. + if not await self._check_slash_authorization(interaction, command_text): + return + await interaction.response.defer(ephemeral=True) event = self._build_slash_event(interaction, command_text) await self.handle_message(event) @@ -2445,7 +2685,8 @@ class DiscordAdapter(BasePlatformAdapter): message: str = "", auto_archive_duration: int = 1440, ): - await interaction.response.defer(ephemeral=True) + # defer() is performed inside the handler *after* the auth gate + # so a rejected invoker can receive an ephemeral rejection. await self._handle_thread_create_slash(interaction, name, message, auto_archive_duration) @tree.command(name="queue", description="Queue a prompt for the next turn (doesn't interrupt)") @@ -2566,6 +2807,54 @@ class DiscordAdapter(BasePlatformAdapter): # supporting up to 25 categories ร— 25 skills = 625 skills. self._register_skill_group(tree) + # Optional defense-in-depth: hide every slash command from non-admin + # guild members in Discord's slash picker. Server-side authorization + # (``_check_slash_authorization``) is the actual gate; this is purely + # UX so users don't see commands they can't invoke. Off by default + # to preserve the slash UX for deployments that intentionally allow + # everyone in the guild. + if os.getenv("DISCORD_HIDE_SLASH_COMMANDS", "false").strip().lower() in ( + "true", "1", "yes", "on", + ): + self._apply_owner_only_visibility(tree) + + def _apply_owner_only_visibility(self, tree) -> None: + """Set default_member_permissions=0 on every registered slash command. + + Discord interprets ``Permissions(0)`` as "requires no permissions", + which paradoxically means the command is hidden from every guild + member except those with the Administrator permission. Server admins + can re-grant per user/role via Server Settings โ†’ Integrations โ†’ + โ†’ Permissions. + + Authoritative gate is ``_check_slash_authorization`` on every + invocation, which catches stale clients, role grants made by + mistake, and direct API calls bypassing Discord's UI hide. + """ + try: + no_perms = discord.Permissions(0) + except Exception as e: + logger.warning( + "[Discord] _apply_owner_only_visibility: cannot build Permissions(0): %s", + e, + ) + return + applied = 0 + for cmd in tree.get_commands(): + try: + cmd.default_permissions = no_perms + applied += 1 + except Exception as e: + logger.debug( + "[Discord] Could not set default_permissions on %r: %s", + getattr(cmd, "name", "?"), e, + ) + logger.info( + "[Discord] Hid %d slash command(s) from non-admin guild members " + "(opt-in defense in depth via DISCORD_HIDE_SLASH_COMMANDS).", + applied, + ) + def _register_skill_group(self, tree) -> None: """Register a single ``/skill`` command with autocomplete on the name. @@ -2584,40 +2873,32 @@ class DiscordAdapter(BasePlatformAdapter): hidden skills. The slash picker also becomes more discoverable โ€” Discord live-filters by the user's typed prefix against both the skill name and its description. + + The entries list and lookup dict are stored on ``self`` rather + than captured in closure variables so :meth:`refresh_skill_group` + can repopulate them when the user runs ``/reload-skills`` without + needing to touch the Discord slash-command tree or trigger a + ``tree.sync()`` call. """ try: - from hermes_cli.commands import discord_skill_commands_by_category - existing_names = set() try: existing_names = {cmd.name for cmd in tree.get_commands()} except Exception: pass - # Reuse the existing collector for consistent filtering - # (per-platform disabled, hub-excluded, name clamping), then - # flatten โ€” the category grouping was only useful for the - # nested layout. - categories, uncategorized, hidden = discord_skill_commands_by_category( - reserved_names=existing_names, - ) - entries: list[tuple[str, str, str]] = list(uncategorized) - for cat_skills in categories.values(): - entries.extend(cat_skills) + # Populate the instance-level entries/lookup so the + # autocomplete + handler callbacks below always read the + # freshest state. refresh_skill_group() re-runs the same + # collector and mutates these two attributes in place. + self._skill_entries: list[tuple[str, str, str]] = [] + self._skill_lookup: dict[str, tuple[str, str]] = {} + self._skill_group_reserved_names: set[str] = set(existing_names) + self._refresh_skill_catalog_state() - if not entries: + if not self._skill_entries: return - # Stable alphabetical order so the autocomplete suggestion - # list is predictable across restarts. - entries.sort(key=lambda t: t[0]) - - # name -> (description, cmd_key) โ€” used by both the autocomplete - # callback and the handler for O(1) dispatch. - skill_lookup: dict[str, tuple[str, str]] = { - n: (d, k) for n, d, k in entries - } - async def _autocomplete_name( interaction: "discord.Interaction", current: str, ) -> list: @@ -2627,10 +2908,29 @@ class DiscordAdapter(BasePlatformAdapter): "/skill pdf" surfaces skills whose description mentions PDFs even if the name doesn't. Discord caps this list at 25 entries per query. + + Authorization: a quiet pre-check evaluates the slash + allowlists and returns ``[]`` for unauthorized users so + the installed skill catalog is not leaked to anyone who + can see the command in the picker. Returning a generic + empty list here is intentional โ€” sending a per-keystroke + ephemeral rejection would produce a barrage of error + popups during typing. + + Reads ``self._skill_entries`` so a ``/reload-skills`` run + since process start shows up on the very next keystroke. """ + try: + allowed, _reason = self._evaluate_slash_authorization(interaction) + except Exception: + # Defensive: never raise from autocomplete. Fail + # closed by returning an empty suggestion list. + return [] + if not allowed: + return [] q = (current or "").strip().lower() choices: list = [] - for name, desc, _key in entries: + for name, desc, _key in self._skill_entries: if not q or q in name.lower() or (desc and q in desc.lower()): if desc: label = f"{name} โ€” {desc}" @@ -2654,7 +2954,13 @@ class DiscordAdapter(BasePlatformAdapter): async def _skill_handler( interaction: "discord.Interaction", name: str, args: str = "", ): - entry = skill_lookup.get(name) + # Authorize BEFORE any skill lookup so that known and + # unknown skill names produce identical rejections for + # unauthorized users (no probing the installed catalog + # via "Unknown skill: " responses). + if not await self._check_slash_authorization(interaction, "/skill"): + return + entry = self._skill_lookup.get(name) if not entry: await interaction.response.send_message( f"Unknown skill: `{name}`. Start typing for " @@ -2676,16 +2982,74 @@ class DiscordAdapter(BasePlatformAdapter): logger.info( "[%s] Registered /skill command with %d skill(s) via autocomplete", - self.name, len(entries), + self.name, len(self._skill_entries), ) - if hidden: + if self._skill_group_hidden_count: logger.info( "[%s] %d skill(s) filtered out of /skill (name clamp / reserved)", - self.name, hidden, + self.name, self._skill_group_hidden_count, ) except Exception as exc: logger.warning("[%s] Failed to register /skill command: %s", self.name, exc) + def _refresh_skill_catalog_state(self) -> None: + """Re-scan disk for skills and repopulate ``self._skill_entries``. + + Called once from :meth:`_register_skill_group` at startup and + again from :meth:`refresh_skill_group` whenever the user runs + ``/reload-skills``. No Discord API calls are made โ€” autocomplete + and the handler both read from these instance attributes + directly, so an in-place mutation is sufficient. + """ + from hermes_cli.commands import discord_skill_commands_by_category + + reserved = getattr(self, "_skill_group_reserved_names", set()) + categories, uncategorized, hidden = discord_skill_commands_by_category( + reserved_names=set(reserved), + ) + entries: list[tuple[str, str, str]] = list(uncategorized) + for cat_skills in categories.values(): + entries.extend(cat_skills) + # Stable alphabetical order so the autocomplete suggestion + # list is predictable across restarts. + entries.sort(key=lambda t: t[0]) + + self._skill_entries = entries + self._skill_lookup = {n: (d, k) for n, d, k in entries} + self._skill_group_hidden_count = hidden + + def refresh_skill_group(self) -> tuple[int, int]: + """Rescan skills and update the live ``/skill`` autocomplete state. + + Invoked by :meth:`gateway.run.GatewayOrchestrator._handle_reload_skills_command` + after :func:`agent.skill_commands.reload_skills` has refreshed + the in-process skill-command registry. Without this call, the + ``/skill`` autocomplete dropdown keeps showing the list captured + at process start โ€” new skills stay invisible and deleted skills + return an "Unknown skill" error when clicked. + + Because autocomplete options are fetched dynamically by Discord, + we only need to mutate the entries/lookup attributes read by the + callbacks โ€” no ``tree.sync()`` is required. + + Returns ``(new_count, hidden_count)``. + """ + try: + self._refresh_skill_catalog_state() + except Exception as exc: + logger.warning( + "[%s] Failed to refresh /skill autocomplete after reload: %s", + self.name, exc, + ) + return (len(getattr(self, "_skill_entries", [])), 0) + logger.info( + "[%s] Refreshed /skill autocomplete: %d skill(s) available (%d filtered)", + self.name, + len(self._skill_entries), + self._skill_group_hidden_count, + ) + return (len(self._skill_entries), self._skill_group_hidden_count) + def _build_slash_event(self, interaction: discord.Interaction, text: str) -> MessageEvent: """Build a MessageEvent from a Discord slash command interaction.""" is_dm = isinstance(interaction.channel, discord.DMChannel) @@ -2743,6 +3107,9 @@ class DiscordAdapter(BasePlatformAdapter): auto_archive_duration: int = 1440, ) -> None: """Create a Discord thread from a slash command and start a session in it.""" + if not await self._check_slash_authorization(interaction, "/thread"): + return + await interaction.response.defer(ephemeral=True) result = await self._create_thread( interaction, name=name, @@ -3037,6 +3404,7 @@ class DiscordAdapter(BasePlatformAdapter): view = ExecApprovalView( session_key=session_key, allowed_user_ids=self._allowed_user_ids, + allowed_role_ids=self._allowed_role_ids, ) msg = await channel.send(embed=embed, view=view) @@ -3075,6 +3443,7 @@ class DiscordAdapter(BasePlatformAdapter): session_key=session_key, confirm_id=confirm_id, allowed_user_ids=self._allowed_user_ids, + allowed_role_ids=self._allowed_role_ids, ) msg = await channel.send(embed=embed, view=view) @@ -3109,6 +3478,7 @@ class DiscordAdapter(BasePlatformAdapter): view = UpdatePromptView( session_key=session_key, allowed_user_ids=self._allowed_user_ids, + allowed_role_ids=self._allowed_role_ids, ) msg = await channel.send(embed=embed, view=view) return SendResult(success=True, message_id=str(msg.id)) @@ -3166,6 +3536,7 @@ class DiscordAdapter(BasePlatformAdapter): session_key=session_key, on_model_selected=on_model_selected, allowed_user_ids=self._allowed_user_ids, + allowed_role_ids=self._allowed_role_ids, ) msg = await channel.send(embed=embed, view=view) @@ -3721,6 +4092,72 @@ class DiscordAdapter(BasePlatformAdapter): # Discord UI Components (outside the adapter class) # --------------------------------------------------------------------------- + +def _component_check_auth( + interaction, + allowed_user_ids: Optional[set], + allowed_role_ids: Optional[set], +) -> bool: + """Shared user-or-role OR semantics for component view button clicks. + + Mirrors ``DiscordAdapter._is_allowed_user`` / the slash and on_message + gates so every Discord interaction surface honors the same trust + boundary. Component views (ExecApprovalView, SlashConfirmView, + UpdatePromptView, ModelPickerView) used to receive only + ``allowed_user_ids``: in role-only deployments + (DISCORD_ALLOWED_ROLES set, DISCORD_ALLOWED_USERS empty) the user + set was empty and the legacy "no allowlist = allow everyone" branch + let any guild member click the buttons -- approving exec commands, + cancelling slash confirmations, switching the model. + + Behavior: + + - both allowlists empty -> allow (preserves existing no-allowlist + deployments, no regression) + - user is in user allowlist -> allow + - role allowlist set + user has a role in it -> allow + - role allowlist set + interaction.user has no resolvable + ``roles`` attribute (e.g. DM context with a role policy active) + -> reject (fail closed) + - otherwise -> reject + """ + user_set = allowed_user_ids or set() + role_set = allowed_role_ids or set() + has_users = bool(user_set) + has_roles = bool(role_set) + if not has_users and not has_roles: + return True + + user = getattr(interaction, "user", None) + if user is None: + return False + + if has_users: + try: + uid = str(user.id) + except AttributeError: + uid = "" + if uid and uid in user_set: + return True + + if has_roles: + roles_attr = getattr(user, "roles", None) + if roles_attr is None: + # Role policy is configured but the interaction doesn't + # carry role data (DM-context Member, raw User payload). + # Fail closed: a user without a resolvable role list cannot + # satisfy a role allowlist. + return False + try: + user_role_ids = {getattr(r, "id", None) for r in roles_attr} + except TypeError: + return False + if user_role_ids & role_set: + return True + + return False + + if DISCORD_AVAILABLE: class ExecApprovalView(discord.ui.View): @@ -3733,17 +4170,23 @@ if DISCORD_AVAILABLE: Only users in the allowed list can click. Times out after 5 minutes. """ - def __init__(self, session_key: str, allowed_user_ids: set): + def __init__( + self, + session_key: str, + allowed_user_ids: set, + allowed_role_ids: Optional[set] = None, + ): super().__init__(timeout=300) # 5-minute timeout self.session_key = session_key self.allowed_user_ids = allowed_user_ids + self.allowed_role_ids = allowed_role_ids or set() self.resolved = False def _check_auth(self, interaction: discord.Interaction) -> bool: """Verify the user clicking is authorized.""" - if not self.allowed_user_ids: - return True # No allowlist = anyone can approve - return str(interaction.user.id) in self.allowed_user_ids + return _component_check_auth( + interaction, self.allowed_user_ids, self.allowed_role_ids, + ) async def _resolve( self, interaction: discord.Interaction, choice: str, @@ -3835,17 +4278,24 @@ if DISCORD_AVAILABLE: 5 minutes (matches the gateway primitive's timeout). """ - def __init__(self, session_key: str, confirm_id: str, allowed_user_ids: set): + def __init__( + self, + session_key: str, + confirm_id: str, + allowed_user_ids: set, + allowed_role_ids: Optional[set] = None, + ): super().__init__(timeout=300) self.session_key = session_key self.confirm_id = confirm_id self.allowed_user_ids = allowed_user_ids + self.allowed_role_ids = allowed_role_ids or set() self.resolved = False def _check_auth(self, interaction: discord.Interaction) -> bool: - if not self.allowed_user_ids: - return True - return str(interaction.user.id) in self.allowed_user_ids + return _component_check_auth( + interaction, self.allowed_user_ids, self.allowed_role_ids, + ) async def _resolve( self, interaction: discord.Interaction, choice: str, @@ -3923,16 +4373,22 @@ if DISCORD_AVAILABLE: 5-minute timeout on its side). """ - def __init__(self, session_key: str, allowed_user_ids: set): + def __init__( + self, + session_key: str, + allowed_user_ids: set, + allowed_role_ids: Optional[set] = None, + ): super().__init__(timeout=300) self.session_key = session_key self.allowed_user_ids = allowed_user_ids + self.allowed_role_ids = allowed_role_ids or set() self.resolved = False def _check_auth(self, interaction: discord.Interaction) -> bool: - if not self.allowed_user_ids: - return True - return str(interaction.user.id) in self.allowed_user_ids + return _component_check_auth( + interaction, self.allowed_user_ids, self.allowed_role_ids, + ) async def _respond( self, interaction: discord.Interaction, answer: str, @@ -4009,6 +4465,7 @@ if DISCORD_AVAILABLE: session_key: str, on_model_selected, allowed_user_ids: set, + allowed_role_ids: Optional[set] = None, ): super().__init__(timeout=120) self.providers = providers @@ -4017,15 +4474,16 @@ if DISCORD_AVAILABLE: self.session_key = session_key self.on_model_selected = on_model_selected self.allowed_user_ids = allowed_user_ids + self.allowed_role_ids = allowed_role_ids or set() self.resolved = False self._selected_provider: str = "" self._build_provider_select() def _check_auth(self, interaction: discord.Interaction) -> bool: - if not self.allowed_user_ids: - return True - return str(interaction.user.id) in self.allowed_user_ids + return _component_check_auth( + interaction, self.allowed_user_ids, self.allowed_role_ids, + ) def _build_provider_select(self): """Build the provider dropdown menu.""" diff --git a/gateway/platforms/feishu.py b/gateway/platforms/feishu.py index 8bc2ae816e..a6b522c4a2 100644 --- a/gateway/platforms/feishu.py +++ b/gateway/platforms/feishu.py @@ -2922,13 +2922,18 @@ class FeishuAdapter(BasePlatformAdapter): }, ) response.raise_for_status() + # Snapshot Content-Type and body while the client context is + # still active so pooled connections fully release on exit. + # See #18451. + content_type_hdr = str(response.headers.get("Content-Type", "")) + body = response.content filename = self._derive_remote_filename( file_url, - content_type=str(response.headers.get("Content-Type", "")), + content_type=content_type_hdr, default_name=preferred_name, default_ext=default_ext, ) - cached_path = cache_document_from_bytes(response.content, filename) + cached_path = cache_document_from_bytes(body, filename) return cached_path, filename @staticmethod diff --git a/gateway/platforms/homeassistant.py b/gateway/platforms/homeassistant.py index 746465594c..6bc9ae6eb6 100644 --- a/gateway/platforms/homeassistant.py +++ b/gateway/platforms/homeassistant.py @@ -139,7 +139,7 @@ class HomeAssistantAdapter(BasePlatformAdapter): async def _ws_connect(self) -> bool: """Establish WebSocket connection and authenticate.""" - ws_url = self._hass_url.replace("http://", "ws://").replace("https://", "wss://") + ws_url = self._hass_url.replace("https://", "wss://").replace("http://", "ws://") ws_url = f"{ws_url}/api/websocket" self._session = aiohttp.ClientSession( diff --git a/gateway/platforms/qqbot/adapter.py b/gateway/platforms/qqbot/adapter.py index 10e1f62e72..c6e5d428c6 100644 --- a/gateway/platforms/qqbot/adapter.py +++ b/gateway/platforms/qqbot/adapter.py @@ -243,10 +243,14 @@ class QQAdapter(BasePlatformAdapter): return False try: + # Tighter keepalive pool so idle CLOSE_WAIT sockets drain + # faster behind proxies like Cloudflare Warp (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits self._http_client = httpx.AsyncClient( timeout=30.0, follow_redirects=True, event_hooks={"response": [_ssrf_redirect_guard]}, + limits=platform_httpx_limits(), ) # 1. Get access token diff --git a/gateway/platforms/signal.py b/gateway/platforms/signal.py index 225430600d..77d3c18cb6 100644 --- a/gateway/platforms/signal.py +++ b/gateway/platforms/signal.py @@ -248,7 +248,9 @@ class SignalAdapter(BasePlatformAdapter): except Exception as e: logger.warning("Signal: Could not acquire phone lock (non-fatal): %s", e) - self.client = httpx.AsyncClient(timeout=30.0) + # Tighter keepalive so idle CLOSE_WAIT drains promptly (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits + self.client = httpx.AsyncClient(timeout=30.0, limits=platform_httpx_limits()) try: # Health check โ€” verify signal-cli daemon is reachable try: diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py index 3208a80a6a..c8ee28859d 100644 --- a/gateway/platforms/slack.py +++ b/gateway/platforms/slack.py @@ -528,6 +528,21 @@ class SlackAdapter(BasePlatformAdapter): return False lock_acquired = True + # Close any previous handler before creating a new one so that + # calling connect() a second time (e.g. during a gateway restart or + # in-process reconnect attempt) does not leave a zombie Socket Mode + # connection alive. Both the old and new connections would otherwise + # receive every Slack event and dispatch it twice, producing double + # responses โ€” the same bug that affected DiscordAdapter (#18187). + if self._handler is not None: + try: + await self._handler.close_async() + except Exception: + logger.debug("[%s] Failed to close previous Slack handler", self.name) + finally: + self._handler = None + self._app = None + # First token is the primary โ€” used for AsyncApp / Socket Mode primary_token = bot_tokens[0] self._app = AsyncApp(token=primary_token) diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index cbee25393e..188038a1ad 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -512,6 +512,17 @@ class TelegramAdapter(BasePlatformAdapter): self.name, attempt, ) self._polling_network_error_count = 0 + # start_polling() returning is necessary but not sufficient: + # PTB's Updater can be left in a state where `running` is True + # but the underlying long-poll task is wedged on a stale httpx + # connection and never makes progress. No error_callback fires + # in that state, so the reconnect ladder won't advance on its + # own. Schedule a deferred probe to detect the wedge and + # re-enter the ladder if needed. + if not self.has_fatal_error: + probe = asyncio.ensure_future(self._verify_polling_after_reconnect()) + self._background_tasks.add(probe) + probe.add_done_callback(self._background_tasks.discard) except Exception as retry_err: logger.warning("[%s] Telegram polling reconnect failed: %s", self.name, retry_err) # start_polling failed โ€” polling is dead and no further error @@ -523,6 +534,50 @@ class TelegramAdapter(BasePlatformAdapter): self._background_tasks.add(task) task.add_done_callback(self._background_tasks.discard) + async def _verify_polling_after_reconnect(self) -> None: + """Heartbeat probe scheduled after a successful reconnect. + + PTB's Updater can survive a botched stop()+start_polling() cycle + with `running=True` but a wedged consumer task. No error callback + fires, so the reconnect ladder doesn't advance on its own. This + probe detects the wedge by: + + 1. Sleeping HEARTBEAT_PROBE_DELAY so a healthy long-poll has time + to complete at least one cycle. + 2. Verifying `Updater.running` is still True. + 3. Probing the bot endpoint with a tight asyncio timeout. A + wedged httpx pool fails this probe; a healthy one returns + well under the timeout. + + On any failure, re-enter the reconnect ladder so the existing + MAX_NETWORK_RETRIES path can ultimately escalate to fatal-error. + """ + HEARTBEAT_PROBE_DELAY = 60 + PROBE_TIMEOUT = 10 + + await asyncio.sleep(HEARTBEAT_PROBE_DELAY) + + if self.has_fatal_error: + return + if not (self._app and self._app.updater and self._app.updater.running): + logger.warning( + "[%s] Updater not running %ds after reconnect โ€” treating as wedged", + self.name, HEARTBEAT_PROBE_DELAY, + ) + await self._handle_polling_network_error( + RuntimeError("Updater not running after reconnect heartbeat") + ) + return + + try: + await asyncio.wait_for(self._app.bot.get_me(), PROBE_TIMEOUT) + except Exception as probe_err: + logger.warning( + "[%s] Polling heartbeat probe failed %ds after reconnect: %s", + self.name, HEARTBEAT_PROBE_DELAY, probe_err, + ) + await self._handle_polling_network_error(probe_err) + async def _handle_polling_conflict(self, error: Exception) -> None: if self.has_fatal_error and self.fatal_error_code == "telegram_polling_conflict": return diff --git a/gateway/platforms/wecom.py b/gateway/platforms/wecom.py index 7ba0fa21b9..453b95a717 100644 --- a/gateway/platforms/wecom.py +++ b/gateway/platforms/wecom.py @@ -206,7 +206,11 @@ class WeComAdapter(BasePlatformAdapter): return False try: - self._http_client = httpx.AsyncClient(timeout=30.0, follow_redirects=True) + # Tighter keepalive so idle CLOSE_WAIT drains promptly (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits + self._http_client = httpx.AsyncClient( + timeout=30.0, follow_redirects=True, limits=platform_httpx_limits(), + ) await self._open_connection() self._mark_connected() self._listen_task = asyncio.create_task(self._listen_loop()) diff --git a/gateway/platforms/wecom_callback.py b/gateway/platforms/wecom_callback.py index 5440792dea..139c67fe7c 100644 --- a/gateway/platforms/wecom_callback.py +++ b/gateway/platforms/wecom_callback.py @@ -119,7 +119,9 @@ class WecomCallbackAdapter(BasePlatformAdapter): pass try: - self._http_client = httpx.AsyncClient(timeout=20.0) + # Tighter keepalive so idle CLOSE_WAIT drains promptly (#18451). + from gateway.platforms._http_client_limits import platform_httpx_limits + self._http_client = httpx.AsyncClient(timeout=20.0, limits=platform_httpx_limits()) self._app = web.Application() self._app.router.add_get("/health", self._handle_health) self._app.router.add_get(self._path, self._handle_verify) diff --git a/gateway/platforms/weixin.py b/gateway/platforms/weixin.py index 72b7d2a4df..3fd7174270 100644 --- a/gateway/platforms/weixin.py +++ b/gateway/platforms/weixin.py @@ -2030,7 +2030,9 @@ async def send_weixin_direct( live_adapter = _LIVE_ADAPTERS.get(resolved_token) send_session = getattr(live_adapter, '_send_session', None) - if live_adapter is not None and send_session is not None and not send_session.closed: + if (live_adapter is not None and send_session is not None + and not send_session.closed + and send_session._loop is asyncio.get_running_loop()): last_result: Optional[SendResult] = None cleaned = live_adapter.format_message(message) if cleaned: diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index a82417a601..921dd70d72 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -185,6 +185,13 @@ class WhatsAppAdapter(BasePlatformAdapter): self._bridge_log: Optional[Path] = None self._poll_task: Optional[asyncio.Task] = None self._http_session: Optional["aiohttp.ClientSession"] = None + # Set to True by disconnect() before we SIGTERM our child bridge so + # _check_managed_bridge_exit() can distinguish an intentional + # shutdown-time exit (returncode -15 / -2 / 0) from a real crash. + # Without this, every graceful gateway shutdown/restart would log + # "Fatal whatsapp adapter error" plus dispatch a fatal-error + # notification before the normal "โœ“ whatsapp disconnected" fires. + self._shutting_down: bool = False def _whatsapp_require_mention(self) -> bool: configured = self.config.extra.get("require_mention") @@ -555,6 +562,21 @@ class WhatsAppAdapter(BasePlatformAdapter): if returncode is None: return None + # Planned shutdown: disconnect() sets _shutting_down before it sends + # SIGTERM to the bridge, so a returncode of -15 (SIGTERM), -2 (SIGINT), + # or 0 (clean exit) at that point is expected, not a crash. Treat it + # as informational and skip the fatal-error path. + # getattr-with-default keeps tests that construct the adapter via + # ``WhatsAppAdapter.__new__`` (bypassing __init__) working without + # every _make_adapter() helper having to seed the attribute. + if getattr(self, "_shutting_down", False) and returncode in (0, -2, -15): + logger.info( + "[%s] Bridge exited during shutdown (code %d).", + self.name, + returncode, + ) + return None + message = f"WhatsApp bridge process exited unexpectedly (code {returncode})." if not self.has_fatal_error: logger.error("[%s] %s", self.name, message) @@ -565,6 +587,10 @@ class WhatsAppAdapter(BasePlatformAdapter): async def disconnect(self) -> None: """Stop the WhatsApp bridge and clean up any orphaned processes.""" + # Flip the shutdown flag BEFORE signalling the child so the exit-check + # path (which runs from other tasks like send() and the poll loop) + # doesn't race us and report the intentional termination as fatal. + self._shutting_down = True if self._bridge_process: try: try: @@ -876,11 +902,15 @@ class WhatsAppAdapter(BasePlatformAdapter): try: import aiohttp - await self._http_session.post( + # Must wrap in `async with` โ€” a bare `await session.post(...)` + # leaves the response object alive until GC, holding its TCP + # socket in CLOSE_WAIT. See #18451. + async with self._http_session.post( f"http://127.0.0.1:{self._bridge_port}/typing", json={"chatId": chat_id}, timeout=aiohttp.ClientTimeout(total=5) - ) + ): + pass except Exception: pass # Ignore typing indicator failures diff --git a/gateway/run.py b/gateway/run.py index 88196d6927..d604947e99 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15,6 +15,7 @@ Usage: import asyncio import dataclasses +import inspect import json import logging import os @@ -282,6 +283,16 @@ def _home_target_env_var(platform_name: str) -> str: ) +def _home_thread_env_var(platform_name: str) -> str: + """Return the optional thread/topic env var for a platform home target.""" + return f"{_home_target_env_var(platform_name)}_THREAD_ID" + + +def _restart_notification_pending() -> bool: + """Return True when a /restart completion marker is waiting to be delivered.""" + return (_hermes_home / ".restart_notify.json").exists() + + _ensure_ssl_certs() # Add parent directory to path @@ -406,37 +417,37 @@ if _config_path.exists(): os.environ[_env_map["base_url"]] = _base_url if _api_key: os.environ[_env_map["api_key"]] = _api_key + # config.yaml is the documented, authoritative source for these + # settings โ€” it unconditionally wins over .env values. Previously + # the guards below read `if X not in os.environ` and let stale + # .env entries (e.g. HERMES_MAX_ITERATIONS=60 written by an old + # `hermes setup` run) silently shadow the user's current config. + # See PR #18413 / the 60-vs-500 max_turns incident. _agent_cfg = _cfg.get("agent", {}) if _agent_cfg and isinstance(_agent_cfg, dict): if "max_turns" in _agent_cfg: os.environ["HERMES_MAX_ITERATIONS"] = str(_agent_cfg["max_turns"]) - # Bridge agent.gateway_timeout โ†’ HERMES_AGENT_TIMEOUT env var. - # Env var from .env takes precedence (already in os.environ). - if "gateway_timeout" in _agent_cfg and "HERMES_AGENT_TIMEOUT" not in os.environ: + if "gateway_timeout" in _agent_cfg: os.environ["HERMES_AGENT_TIMEOUT"] = str(_agent_cfg["gateway_timeout"]) - if "gateway_timeout_warning" in _agent_cfg and "HERMES_AGENT_TIMEOUT_WARNING" not in os.environ: + if "gateway_timeout_warning" in _agent_cfg: os.environ["HERMES_AGENT_TIMEOUT_WARNING"] = str(_agent_cfg["gateway_timeout_warning"]) - if "gateway_notify_interval" in _agent_cfg and "HERMES_AGENT_NOTIFY_INTERVAL" not in os.environ: + if "gateway_notify_interval" in _agent_cfg: os.environ["HERMES_AGENT_NOTIFY_INTERVAL"] = str(_agent_cfg["gateway_notify_interval"]) - if "restart_drain_timeout" in _agent_cfg and "HERMES_RESTART_DRAIN_TIMEOUT" not in os.environ: + if "restart_drain_timeout" in _agent_cfg: os.environ["HERMES_RESTART_DRAIN_TIMEOUT"] = str(_agent_cfg["restart_drain_timeout"]) - if ( - "gateway_auto_continue_freshness" in _agent_cfg - and "HERMES_AUTO_CONTINUE_FRESHNESS" not in os.environ - ): + if "gateway_auto_continue_freshness" in _agent_cfg: os.environ["HERMES_AUTO_CONTINUE_FRESHNESS"] = str( _agent_cfg["gateway_auto_continue_freshness"] ) _display_cfg = _cfg.get("display", {}) if _display_cfg and isinstance(_display_cfg, dict): - if "busy_input_mode" in _display_cfg and "HERMES_GATEWAY_BUSY_INPUT_MODE" not in os.environ: + if "busy_input_mode" in _display_cfg: os.environ["HERMES_GATEWAY_BUSY_INPUT_MODE"] = str(_display_cfg["busy_input_mode"]) - if "busy_ack_enabled" in _display_cfg and "HERMES_GATEWAY_BUSY_ACK_ENABLED" not in os.environ: + if "busy_ack_enabled" in _display_cfg: os.environ["HERMES_GATEWAY_BUSY_ACK_ENABLED"] = str(_display_cfg["busy_ack_enabled"]) # Timezone: bridge config.yaml โ†’ HERMES_TIMEZONE env var. - # HERMES_TIMEZONE from .env takes precedence (already in os.environ). _tz_cfg = _cfg.get("timezone", "") - if _tz_cfg and isinstance(_tz_cfg, str) and "HERMES_TIMEZONE" not in os.environ: + if _tz_cfg and isinstance(_tz_cfg, str): os.environ["HERMES_TIMEZONE"] = _tz_cfg.strip() # Security settings _security_cfg = _cfg.get("security", {}) @@ -444,8 +455,24 @@ if _config_path.exists(): _redact = _security_cfg.get("redact_secrets") if _redact is not None: os.environ["HERMES_REDACT_SECRETS"] = str(_redact).lower() - except Exception: - pass # Non-fatal; gateway can still run with .env values + except Exception as _bridge_err: + # Previously this was silent (`except Exception: pass`), which + # hid partial bridge failures and let .env defaults shadow + # config.yaml values โ€” users observed max_turns=500 in config + # but a 60-iteration cap in practice. Surface the failure to + # stderr so operators see it even though `logger` is not yet + # initialized at module-import time (logger is defined further + # down this module). + print( + f" Warning: config.yaml โ†’ env bridge failed: " + f"{type(_bridge_err).__name__}: {_bridge_err}", + file=sys.stderr, + ) + print( + " Gateway will fall back to .env values, which may not match " + "your current config.yaml. Run `hermes doctor` to investigate.", + file=sys.stderr, + ) # Apply IPv4 preference if configured (before any HTTP clients are created). try: @@ -490,6 +517,8 @@ from gateway.config import ( Platform, _BUILTIN_PLATFORM_VALUES, GatewayConfig, + HomeChannel, + PlatformConfig, load_gateway_config, ) from gateway.session import ( @@ -673,11 +702,69 @@ def _is_control_interrupt_message(message: Optional[str]) -> bool: return normalized in _CONTROL_INTERRUPT_MESSAGES +def _skill_slug_from_frontmatter(skill_md: Path) -> tuple[str | None, str | None]: + """Derive the /command slug and declared frontmatter name from a SKILL.md. + + Matches the exact normalization used by + :func:`agent.skill_commands.scan_skill_commands` so the slug here is the + same string a user types after the leading ``/`` (e.g. a skill with + frontmatter ``name: Stable Diffusion Image Generation`` resolves to + ``stable-diffusion-image-generation`` โ€” NOT the parent directory name, + which is commonly shorter/different, e.g. ``stable-diffusion``). + + Using the directory name silently broke :func:`_check_unavailable_skill` + for every skill whose directory name drifted from its frontmatter name + (19 such skills on a standard install as of 2026-05), causing a generic + "unknown command" response where a "disabled โ€” enable with โ€ฆ" or + "not installed โ€” install with โ€ฆ" hint was expected. + + Returns ``(slug, declared_name)`` or ``(None, None)`` when the file + can't be read or lacks a ``name:`` in its frontmatter. + """ + try: + content = skill_md.read_text(encoding="utf-8", errors="replace") + except Exception: + return None, None + if not content.startswith("---"): + return None, None + end = content.find("\n---", 3) + if end < 0: + return None, None + declared_name: str | None = None + for line in content[3:end].splitlines(): + line = line.strip() + if line.startswith("name:"): + raw = line.split(":", 1)[1].strip() + # Strip YAML quote wrappers if present + if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in ('"', "'"): + raw = raw[1:-1] + declared_name = raw.strip() + break + if not declared_name: + return None, None + slug = declared_name.lower().replace(" ", "-").replace("_", "-") + # Mirror _SKILL_INVALID_CHARS and _SKILL_MULTI_HYPHEN from skill_commands + import re as _re + slug = _re.sub(r"[^a-z0-9-]", "", slug) + slug = _re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + return None, declared_name + return slug, declared_name + + def _check_unavailable_skill(command_name: str) -> str | None: """Check if a command matches a known-but-inactive skill. Returns a helpful message if the skill exists but is disabled or only available as an optional install. Returns None if no match found. + + The slug for each on-disk skill is derived from its frontmatter ``name:`` + (via :func:`_skill_slug_from_frontmatter`), NOT from its containing + directory name โ€” because the two can differ (e.g. directory + ``stable-diffusion`` + frontmatter ``Stable Diffusion Image Generation`` + yields slug ``stable-diffusion-image-generation``). Matching on + directory name would miss that slug entirely and fall through to the + generic "unknown command" path. """ # Normalize: command uses hyphens, skill names may use hyphens or underscores normalized = command_name.lower().replace("_", "-") @@ -693,8 +780,12 @@ def _check_unavailable_skill(command_name: str) -> str | None: for skill_md in skills_dir.rglob("SKILL.md"): if any(part in ('.git', '.github', '.hub', '.archive') for part in skill_md.parts): continue - name = skill_md.parent.name.lower().replace("_", "-") - if name == normalized and name in disabled: + slug, declared_name = _skill_slug_from_frontmatter(skill_md) + if not slug or not declared_name: + continue + # disabled is keyed by the declared frontmatter name (what + # skills.disabled / skills.platform_disabled store). + if slug == normalized and declared_name in disabled: return ( f"The **{command_name}** skill is installed but disabled.\n" f"Enable it with: `hermes skills config`" @@ -706,8 +797,10 @@ def _check_unavailable_skill(command_name: str) -> str | None: optional_dir = get_optional_skills_dir(repo_root / "optional-skills") if optional_dir.exists(): for skill_md in optional_dir.rglob("SKILL.md"): - name = skill_md.parent.name.lower().replace("_", "-") - if name == normalized: + slug, _declared = _skill_slug_from_frontmatter(skill_md) + if not slug: + continue + if slug == normalized: # Build install path: official// rel = skill_md.parent.relative_to(optional_dir) parts = list(rel.parts) @@ -2176,15 +2269,13 @@ class GatewayRunner: logger.debug("Failed interrupting agent during shutdown: %s", e) async def _notify_active_sessions_of_shutdown(self) -> None: - """Send a notification to every chat with an active agent. + """Send shutdown/restart notifications to active chats and home channels. Called at the very start of stop() โ€” adapters are still connected so - messages can be delivered. Best-effort: individual send failures are + messages can be delivered. Best-effort: individual send failures are logged and swallowed so they never block the shutdown sequence. """ active = self._snapshot_running_agents() - if not active: - return action = "restarting" if self._restart_requested else "shutting down" hint = ( @@ -2195,7 +2286,7 @@ class GatewayRunner: ) msg = f"โš ๏ธ Gateway {action} โ€” {hint}" - notified: set = set() + notified: set[tuple[str, str, Optional[str]]] = set() for session_key in active: source = None try: @@ -2212,7 +2303,7 @@ class GatewayRunner: if source is not None: platform_str = source.platform.value - chat_id = source.chat_id + chat_id = str(source.chat_id) thread_id = source.thread_id else: # Fall back to parsing the session key when no persisted @@ -2224,9 +2315,10 @@ class GatewayRunner: chat_id = _parsed["chat_id"] thread_id = _parsed.get("thread_id") - # Deduplicate: one notification per chat, even if multiple - # sessions (different users/threads) share the same chat. - dedup_key = (platform_str, chat_id) + # Deduplicate only identical delivery targets. Thread/topic-aware + # platforms can share a parent chat while still routing to distinct + # destinations via metadata. + dedup_key = (platform_str, chat_id, str(thread_id) if thread_id else None) if dedup_key in notified: continue @@ -2240,10 +2332,19 @@ class GatewayRunner: # correct forum topic / thread. metadata = {"thread_id": thread_id} if thread_id else None - await adapter.send(chat_id, msg, metadata=metadata) + result = await adapter.send(chat_id, msg, metadata=metadata) + if result is not None and getattr(result, "success", True) is False: + logger.debug( + "Failed to send shutdown notification to %s:%s: %s", + platform_str, + chat_id, + getattr(result, "error", "send returned success=False"), + ) + continue + notified.add(dedup_key) logger.info( - "Sent shutdown notification to %s:%s", + "Sent shutdown notification to active chat %s:%s", platform_str, chat_id, ) except Exception as e: @@ -2252,6 +2353,44 @@ class GatewayRunner: platform_str, chat_id, e, ) + for platform, adapter in self.adapters.items(): + home = self.config.get_home_channel(platform) + if not home or not home.chat_id: + continue + + dedup_key = (platform.value, str(home.chat_id), str(home.thread_id) if home.thread_id else None) + if dedup_key in notified: + continue + + try: + metadata = {"thread_id": home.thread_id} if home.thread_id else None + if metadata: + result = await adapter.send(str(home.chat_id), msg, metadata=metadata) + else: + result = await adapter.send(str(home.chat_id), msg) + if result is not None and getattr(result, "success", True) is False: + logger.debug( + "Failed to send shutdown notification to home channel %s:%s: %s", + platform.value, + home.chat_id, + getattr(result, "error", "send returned success=False"), + ) + continue + + notified.add(dedup_key) + logger.info( + "Sent shutdown notification to home channel %s:%s", + platform.value, + home.chat_id, + ) + except Exception as e: + logger.debug( + "Failed to send shutdown notification to home channel %s:%s: %s", + platform.value, + home.chat_id, + e, + ) + def _finalize_shutdown_agents(self, active_agents: Dict[str, Any]) -> None: for agent in active_agents.values(): try: @@ -2519,6 +2658,18 @@ class GatewayRunner: """ logger.info("Starting Hermes Gateway...") logger.info("Session storage: %s", self.config.sessions_dir) + # Log the resolved max_iterations budget so operators can verify the + # config.yaml โ†’ env bridge did the right thing at a glance (instead + # of silently running at a stale .env value for weeks). + try: + _effective_max_iter = int(os.getenv("HERMES_MAX_ITERATIONS", "90")) + logger.info( + "Agent budget: max_iterations=%d (agent.max_turns from config.yaml, " + "or HERMES_MAX_ITERATIONS from .env, or default 90)", + _effective_max_iter, + ) + except Exception: + pass try: from hermes_cli.profiles import get_active_profile_name _profile = get_active_profile_name() @@ -2662,7 +2813,7 @@ class GatewayRunner: try: suspended = self.session_store.suspend_recently_active() if suspended: - logger.info("Suspended %d in-flight session(s) from previous run", suspended) + logger.info("Marked %d in-flight session(s) as resumable from previous run", suspended) except Exception as e: logger.warning("Session suspension on startup failed: %s", e) @@ -2860,8 +3011,28 @@ class GatewayRunner: ): self._schedule_update_notification_watch() + # Give freshly connected platform adapters a brief moment to settle + # before sending restart/startup lifecycle messages. In practice this + # helps Discord thread deliveries right after reconnect. + if connected_count > 0: + await asyncio.sleep(1.0) + # Notify the chat that initiated /restart that the gateway is back. - await self._send_restart_notification() + restart_notification_pending = _restart_notification_pending() + delivered_restart_target = await self._send_restart_notification() + + # Broadcast a lightweight "gateway is back" message to configured + # home channels only when this startup is resuming from /restart. If a + # /restart requester already received a direct completion notice in the + # same chat, skip the generic broadcast there to avoid duplicates while + # still allowing a home-channel fallback when the direct send fails. + if restart_notification_pending or delivered_restart_target is not None: + skip_home_targets = ( + {delivered_restart_target} if delivered_restart_target else None + ) + await self._send_home_channel_startup_notifications( + skip_targets=skip_home_targets, + ) # Drain any recovered process watchers (from crash recovery checkpoint) try: @@ -3889,7 +4060,9 @@ class GatewayRunner: if not check_discord_requirements(): logger.warning("Discord: discord.py not installed") return None - return DiscordAdapter(config) + adapter = DiscordAdapter(config) + adapter.gateway_runner = self # For cross-platform admin alerts on unauthorized slash + return adapter elif platform == Platform.WHATSAPP: from gateway.platforms.whatsapp import WhatsAppAdapter, check_whatsapp_requirements @@ -7792,24 +7965,33 @@ class GatewayRunner: msg = decision.get("message") or "" # Send the status line back to the user so they see the judge's - # verdict. Fire-and-forget via the adapter. + # verdict. Fire-and-forget via the adapter's ``send()`` method โ€” + # adapters expose ``send(chat_id, content, reply_to, metadata)``, + # not a ``send_message(source, msg)`` wrapper, so an earlier + # ``hasattr(adapter, "send_message")`` gate here was dead code and + # users never saw ``โœ“ Goal achieved`` / ``โธ budget exhausted`` + # verdicts. if msg and source is not None: try: adapter = self.adapters.get(source.platform) - if adapter and hasattr(adapter, "send_message"): + if adapter is not None and hasattr(adapter, "send"): import asyncio as _asyncio - coro = adapter.send_message(source, msg) + thread_meta = ( + {"thread_id": source.thread_id} if source.thread_id else None + ) + coro = adapter.send( + chat_id=source.chat_id, + content=msg, + metadata=thread_meta, + ) if _asyncio.iscoroutine(coro): try: - loop = _asyncio.get_event_loop() - if loop.is_running(): - loop.create_task(coro) - else: - loop.run_until_complete(coro) + loop = _asyncio.get_running_loop() + loop.create_task(coro) except RuntimeError: - # No event loop in this thread โ€” schedule on the main one. + # No running loop in this thread โ€” best effort. try: - _asyncio.run_coroutine_threadsafe(coro, self._loop) + _asyncio.run(coro) except Exception: pass except Exception as exc: @@ -7872,14 +8054,33 @@ class GatewayRunner: chat_name = source.chat_name or chat_id env_key = _home_target_env_var(platform_name) + thread_env_key = _home_thread_env_var(platform_name) + thread_id = source.thread_id # Save to .env so it persists across restarts try: from hermes_cli.config import save_env_value save_env_value(env_key, str(chat_id)) + # Keep thread/topic routing explicit and clear stale values when + # /sethome is run from the parent chat instead of a thread. + save_env_value(thread_env_key, str(thread_id or "")) except Exception as e: return f"Failed to save home channel: {e}" + # Keep the running gateway config in sync too. The pre-restart + # notification path reads self.config before the process reloads env. + if source.platform: + platform_config = self.config.platforms.setdefault( + source.platform, + PlatformConfig(enabled=True), + ) + platform_config.home_channel = HomeChannel( + platform=source.platform, + chat_id=str(chat_id), + name=chat_name, + thread_id=str(thread_id) if thread_id else None, + ) + return ( f"โœ… Home channel set to **{chat_name}** (ID: {chat_id}).\n" f"Cron jobs and cross-platform messages will be delivered here." @@ -9623,6 +9824,28 @@ class GatewayRunner: removed = result.get("removed", []) # [{"name", "description"}, ...] total = result.get("total", 0) + # Let each connected adapter refresh any platform-side state + # that cached the skill list at startup. Today that's the + # Discord /skill autocomplete (registered once per connect); + # without this call, new skills stay invisible in the + # dropdown and deleted skills error out when clicked. Other + # adapters that don't override refresh_skill_group (Telegram's + # BotCommand menu, Slack subcommand map, etc.) are silently + # skipped โ€” the in-process reload above is enough for them. + for adapter in list(self.adapters.values()): + refresh = getattr(adapter, "refresh_skill_group", None) + if not callable(refresh): + continue + try: + maybe = refresh() + if inspect.isawaitable(maybe): + await maybe + except Exception as exc: + logger.warning( + "Adapter %s refresh_skill_group raised: %s", + getattr(adapter, "name", adapter), exc, + ) + lines = ["๐Ÿ”„ **Skills Reloaded**\n"] if not added and not removed: lines.append("No new skills detected.") @@ -10341,11 +10564,11 @@ class GatewayRunner: return True - async def _send_restart_notification(self) -> None: + async def _send_restart_notification(self) -> Optional[tuple[str, str, Optional[str]]]: """Notify the chat that initiated /restart that the gateway is back.""" notify_path = _hermes_home / ".restart_notify.json" if not notify_path.exists(): - return + return None try: data = json.loads(notify_path.read_text()) @@ -10354,7 +10577,7 @@ class GatewayRunner: thread_id = data.get("thread_id") if not platform_str or not chat_id: - return + return None platform = Platform(platform_str) adapter = self.adapters.get(platform) @@ -10363,24 +10586,94 @@ class GatewayRunner: "Restart notification skipped: %s adapter not connected", platform_str, ) - return + return None metadata = {"thread_id": thread_id} if thread_id else None - await adapter.send( - chat_id, + result = await adapter.send( + str(chat_id), "โ™ป Gateway restarted successfully. Your session continues.", metadata=metadata, ) + # adapter.send() catches provider errors (e.g. "Chat not found") + # and returns SendResult(success=False) rather than raising, so + # we must inspect the result before claiming success โ€” otherwise + # the log line is misleading and hides real delivery failures. + if result is not None and getattr(result, "success", True) is False: + logger.warning( + "Restart notification to %s:%s was not delivered: %s", + platform_str, + chat_id, + getattr(result, "error", "send returned success=False"), + ) + return None + logger.info( "Sent restart notification to %s:%s", platform_str, chat_id, ) + return str(platform_str), str(chat_id), str(thread_id) if thread_id else None except Exception as e: logger.warning("Restart notification failed: %s", e) + return None finally: notify_path.unlink(missing_ok=True) + async def _send_home_channel_startup_notifications( + self, + *, + skip_targets: Optional[set[tuple[str, str, Optional[str]]]] = None, + ) -> set[tuple[str, str, Optional[str]]]: + """Notify configured home channels that the gateway is back online. + + The notification is best-effort and sent once per connected platform + home channel. ``skip_targets`` lets startup avoid duplicate messages + when a more specific restart notification is queued for the same chat. + """ + delivered: set[tuple[str, str, Optional[str]]] = set() + skipped = skip_targets or set() + message = "โ™ป๏ธ Gateway online โ€” Hermes is back and ready." + + for platform, adapter in self.adapters.items(): + home = self.config.get_home_channel(platform) + if not home or not home.chat_id: + continue + + target = (platform.value, str(home.chat_id), str(home.thread_id) if home.thread_id else None) + if target in skipped or target in delivered: + continue + + try: + metadata = {"thread_id": home.thread_id} if home.thread_id else None + if metadata: + result = await adapter.send(str(home.chat_id), message, metadata=metadata) + else: + result = await adapter.send(str(home.chat_id), message) + if result is not None and getattr(result, "success", True) is False: + logger.warning( + "Home-channel startup notification failed for %s:%s: %s", + platform.value, + home.chat_id, + getattr(result, "error", "send returned success=False"), + ) + continue + + delivered.add(target) + logger.info( + "Sent home-channel startup notification to %s:%s", + platform.value, + home.chat_id, + ) + except Exception as exc: + logger.warning( + "Home-channel startup notification failed for %s:%s: %s", + platform.value, + home.chat_id, + exc, + ) + + return delivered + def _set_session_env(self, context: SessionContext) -> list: """Set session context variables for the current async task. diff --git a/gateway/session.py b/gateway/session.py index fcff336afa..3129f7a325 100644 --- a/gateway/session.py +++ b/gateway/session.py @@ -1086,19 +1086,22 @@ class SessionStore: return len(removed_keys) def suspend_recently_active(self, max_age_seconds: int = 120) -> int: - """Mark recently-active sessions as suspended. + """Mark recently-active sessions as resumable after an unexpected exit. - Called on gateway startup to prevent sessions that were likely - in-flight when the gateway last exited from being blindly resumed - (#7536). Only suspends sessions updated within *max_age_seconds* - to avoid resetting long-idle sessions that are harmless to resume. - Returns the number of sessions that were suspended. + Called on gateway startup after a crash or fast restart to preserve + in-flight sessions instead of destroying their conversation history + (#7536). Only marks sessions updated within *max_age_seconds* to + avoid touching long-idle sessions. Sets ``resume_pending=True`` so + the next incoming message on the same session_key auto-resumes from + the existing transcript. - Entries flagged ``resume_pending=True`` are skipped โ€” those were - marked intentionally by the drain-timeout path as recoverable. - Terminal escalation for genuinely stuck ``resume_pending`` sessions - is handled by the existing ``.restart_failure_counts`` stuck-loop - counter, which runs after this method on startup. + Entries already flagged ``resume_pending=True`` are skipped. Entries + explicitly ``suspended=True`` (from /stop or stuck-loop escalation) + are also skipped. Terminal escalation for genuinely stuck sessions + is still handled by the existing ``.restart_failure_counts`` counter + (threshold 3), which runs after this method and sets ``suspended=True``. + + Returns the number of sessions marked resumable. """ from datetime import timedelta @@ -1110,7 +1113,9 @@ class SessionStore: if entry.resume_pending: continue if not entry.suspended and entry.updated_at >= cutoff: - entry.suspended = True + entry.resume_pending = True + entry.resume_reason = "restart_interrupted" + entry.last_resume_marked_at = _now() count += 1 if count: self._save() diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 41b1dad500..07e7273bf7 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -10,6 +10,7 @@ To add an alias: set ``aliases=("short",)`` on the existing ``CommandDef``. from __future__ import annotations +import logging import os import re import shutil @@ -21,6 +22,8 @@ from typing import Any from utils import is_truthy_value +logger = logging.getLogger(__name__) + # prompt_toolkit is an optional CLI dependency โ€” only needed for # SlashCommandCompleter and SlashCommandAutoSuggest. Gateway and test # environments that lack it must still be able to import this module @@ -499,9 +502,9 @@ def _sanitize_telegram_name(raw: str) -> str: def _clamp_command_names( - entries: list[tuple[str, str]], + entries: list[tuple[str, ...]], reserved: set[str], -) -> list[tuple[str, str]]: +) -> list[tuple[str, ...]]: """Enforce 32-char command name limit with collision avoidance. Both Telegram and Discord cap slash command names at 32 characters. @@ -509,10 +512,15 @@ def _clamp_command_names( (against *reserved* names or earlier entries in the same batch), the name is shortened to 31 chars and a digit ``0``-``9`` is appended to differentiate. If all 10 digit slots are taken the entry is silently dropped. + + Accepts tuples of any length >= 2. Extra elements beyond ``(name, desc)`` + (e.g. ``cmd_key``) are passed through unchanged, so callers can attach + metadata that survives the rename. """ used: set[str] = set(reserved) - result: list[tuple[str, str]] = [] - for name, desc in entries: + result: list[tuple] = [] + for entry in entries: + name, desc, *extra = entry if len(name) > _CMD_NAME_LIMIT: candidate = name[:_CMD_NAME_LIMIT] if candidate in used: @@ -528,7 +536,7 @@ def _clamp_command_names( if name in used: continue used.add(name) - result.append((name, desc)) + result.append((name, desc, *extra)) return result @@ -611,13 +619,26 @@ def _collect_gateway_skill_entries( try: from agent.skill_commands import get_skill_commands from tools.skills_tool import SKILLS_DIR + from agent.skill_utils import get_external_skills_dirs _skills_dir = str(SKILLS_DIR.resolve()) - _hub_dir = str((SKILLS_DIR / ".hub").resolve()) + _hub_dir = str((SKILLS_DIR / ".hub").resolve()).rstrip("/") + "/" + # Build set of allowed directory prefixes: local skills dir + any + # user-configured ``skills.external_dirs``. Ensure each prefix ends + # with ``/`` so ``/my-skills`` does not also match ``/my-skills-extra``. + # Without this widening, external skills are visible in + # ``hermes skills list`` and the agent's ``/skill-name`` dispatch but + # silently excluded from gateway slash menus (#8110). + _allowed_prefixes = [_skills_dir.rstrip("/") + "/"] + _allowed_prefixes.extend( + str(d).rstrip("/") + "/" for d in get_external_skills_dirs() + ) skill_cmds = get_skill_commands() for cmd_key in sorted(skill_cmds): info = skill_cmds[cmd_key] skill_path = info.get("skill_md_path", "") - if not skill_path.startswith(_skills_dir): + if not skill_path: + continue + if not any(skill_path.startswith(prefix) for prefix in _allowed_prefixes): continue if skill_path.startswith(_hub_dir): continue @@ -635,17 +656,15 @@ def _collect_gateway_skill_entries( except Exception: pass - # Clamp names; _clamp_command_names works on (name, desc) pairs so we - # need to zip/unzip. - skill_pairs = [(n, d) for n, d, _ in skill_triples] - key_by_pair = {(n, d): k for n, d, k in skill_triples} - skill_pairs = _clamp_command_names(skill_pairs, reserved_names) + # Clamp names; cmd_key is passed through as extra payload so it survives + # any clamp-induced renames. + skill_triples = _clamp_command_names(skill_triples, reserved_names) # Skills fill remaining slots โ€” only tier that gets trimmed remaining = max(0, max_slots - len(all_entries)) - hidden_count = max(0, len(skill_pairs) - remaining) - for n, d in skill_pairs[:remaining]: - all_entries.append((n, d, key_by_pair.get((n, d), ""))) + hidden_count = max(0, len(skill_triples) - remaining) + for n, d, k in skill_triples[:remaining]: + all_entries.append((n, d, k)) return all_entries[:max_slots], hidden_count @@ -721,24 +740,40 @@ def discord_skill_commands( def discord_skill_commands_by_category( reserved_names: set[str], ) -> tuple[dict[str, list[tuple[str, str, str]]], list[tuple[str, str, str]], int]: - """Return skill entries organized by category for Discord ``/skill`` subcommand groups. + """Return skill entries organized by category for Discord ``/skill`` autocomplete. - Skills whose directory is nested at least 2 levels under ``SKILLS_DIR`` + Skills whose directory is nested at least 2 levels under a scan root (e.g. ``creative/ascii-art/SKILL.md``) are grouped by their top-level category. Root-level skills (e.g. ``dogfood/SKILL.md``) are returned as - *uncategorized* โ€” the caller should register them as direct subcommands - of the ``/skill`` group. + *uncategorized*. - The same filtering as :func:`discord_skill_commands` is applied: hub - skills excluded, per-platform disabled excluded, names clamped. + Scan roots include the local ``SKILLS_DIR`` **and** any configured + ``skills.external_dirs`` โ€” matching the widened filter applied to the + flat ``discord_skill_commands()`` collector in #18741. Without this + parity, external-dir skills are visible via ``hermes skills list`` and + the agent's ``/skill-name`` dispatch but silently absent from Discord's + ``/skill`` autocomplete. + + Filtering mirrors :func:`discord_skill_commands`: hub skills excluded, + per-platform disabled excluded, names clamped to 32 chars, descriptions + clamped to 100 chars. + + The legacy 25-group ร— 25-subcommand caps (from the old nested + ``/skill `` layout) are **not** applied โ€” the live caller + (``_register_skill_group`` in ``gateway/platforms/discord.py``, refactored + in PR #11580) flattens these results and feeds them into a single + autocomplete callback, which scales to thousands of entries without any + per-command payload concerns. ``hidden_count`` is retained in the return + tuple for backward compatibility and still reports skills dropped for + other reasons (32-char clamp collision vs a reserved name). Returns: ``(categories, uncategorized, hidden_count)`` - *categories*: ``{category_name: [(name, description, cmd_key), ...]}`` - *uncategorized*: ``[(name, description, cmd_key), ...]`` - - *hidden_count*: skills dropped due to Discord group limits - (25 subcommand groups, 25 subcommands per group) + - *hidden_count*: skills dropped due to name clamp collisions + against already-registered command names. """ from pathlib import Path as _P @@ -752,14 +787,33 @@ def discord_skill_commands_by_category( # Collect raw skill data -------------------------------------------------- categories: dict[str, list[tuple[str, str, str]]] = {} uncategorized: list[tuple[str, str, str]] = [] - _names_used: set[str] = set(reserved_names) + # Map clamped-32-char-name โ†’ what it came from, so we can emit an + # actionable warning on collision. Reserved (gateway-builtin) command + # names are marked with a sentinel so the warning distinguishes + # "skill collided with a reserved command" from "two skills collided + # on the 32-char clamp" โ€” the latter is the rename-worthy case. + _names_used: dict[str, str] = {n: "" for n in reserved_names} hidden = 0 try: from agent.skill_commands import get_skill_commands + from agent.skill_utils import get_external_skills_dirs from tools.skills_tool import SKILLS_DIR + _skills_dir = SKILLS_DIR.resolve() _hub_dir = (SKILLS_DIR / ".hub").resolve() + # Build list of (resolved_root, is_local) tuples. Each external dir + # becomes its own scan root for category derivation โ€” a skill at + # ``/mlops/foo/SKILL.md`` is still categorized as "mlops". + _scan_roots: list[_P] = [_skills_dir] + try: + for ext in get_external_skills_dirs(): + try: + _scan_roots.append(_P(ext).resolve()) + except Exception: + continue + except Exception: + pass skill_cmds = get_skill_commands() for cmd_key in sorted(skill_cmds): @@ -768,33 +822,72 @@ def discord_skill_commands_by_category( if not skill_path: continue sp = _P(skill_path).resolve() - # Skip skills outside SKILLS_DIR or from the hub - if not str(sp).startswith(str(_skills_dir)): - continue + # Hub skills are loaded via the skill hub, not surfaced as + # slash commands. if str(sp).startswith(str(_hub_dir)): continue + # Accept skill if it lives under any scan root; record the + # matching root so we can derive the category correctly. + matched_root: _P | None = None + for root in _scan_roots: + try: + sp.relative_to(root) + except ValueError: + continue + matched_root = root + break + if matched_root is None: + continue skill_name = info.get("name", "") if skill_name in _platform_disabled: continue raw_name = cmd_key.lstrip("/") - # Clamp to 32 chars (Discord limit) + # Clamp to 32 chars (Discord per-command name limit) discord_name = raw_name[:32] if discord_name in _names_used: + # Two skills whose first 32 chars are identical. One wins + # (the first one seen, which is alphabetical because the + # caller iterates ``sorted(skill_cmds)``); the other is + # dropped from Discord's /skill autocomplete. + # + # Silently counting this as ``hidden`` (the old behavior) + # meant skill authors had no way to discover the drop โ€” + # their skill just didn't appear in the picker. Emit a + # WARNING naming both sides so the author can rename the + # losing skill's frontmatter name to something with a + # distinct 32-char prefix. + prior = _names_used[discord_name] + if prior == "": + logger.warning( + "Discord /skill: %r (from %r) collides on its 32-char " + "clamp with a reserved gateway command name %r โ€” the " + "skill will not appear in the /skill autocomplete. " + "Rename the skill's frontmatter ``name:`` to differ " + "in its first 32 chars.", + discord_name, cmd_key, discord_name, + ) + else: + logger.warning( + "Discord /skill: %r and %r both clamp to %r on " + "Discord's 32-char command-name limit โ€” only %r " + "will appear in the /skill autocomplete. Rename " + "one skill's frontmatter ``name:`` to differ in " + "its first 32 chars.", + prior, cmd_key, discord_name, prior, + ) + hidden += 1 continue - _names_used.add(discord_name) + _names_used[discord_name] = cmd_key desc = info.get("description", "") if len(desc) > 100: desc = desc[:97] + "..." - # Determine category from the relative path within SKILLS_DIR. - # e.g. creative/ascii-art/SKILL.md โ†’ parts = ("creative", "ascii-art") - try: - rel = sp.parent.relative_to(_skills_dir) - except ValueError: - continue + # Determine category from the relative path within the matched + # scan root. e.g. creative/ascii-art/SKILL.md โ†’ ("creative", ...) + rel = sp.parent.relative_to(matched_root) parts = rel.parts if len(parts) >= 2: cat = parts[0] @@ -804,28 +897,7 @@ def discord_skill_commands_by_category( except Exception: pass - # Enforce Discord limits: 25 subcommand groups, 25 subcommands each ------ - _MAX_GROUPS = 25 - _MAX_PER_GROUP = 25 - - trimmed_categories: dict[str, list[tuple[str, str, str]]] = {} - group_count = 0 - for cat in sorted(categories): - if group_count >= _MAX_GROUPS: - hidden += len(categories[cat]) - continue - entries = categories[cat][:_MAX_PER_GROUP] - hidden += max(0, len(categories[cat]) - _MAX_PER_GROUP) - trimmed_categories[cat] = entries - group_count += 1 - - # Uncategorized skills also count against the 25 top-level limit - remaining_slots = _MAX_GROUPS - group_count - if len(uncategorized) > remaining_slots: - hidden += len(uncategorized) - remaining_slots - uncategorized = uncategorized[:remaining_slots] - - return trimmed_categories, uncategorized, hidden + return categories, uncategorized, hidden # --------------------------------------------------------------------------- diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 57f3e7012a..672aa6ae26 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -400,7 +400,12 @@ DEFAULT_CONFIG = { # The gateway stops accepting new work, waits for running agents # to finish, then interrupts any remaining runs after the timeout. # 0 = no drain, interrupt immediately. - "restart_drain_timeout": 60, + # + # 180s is calibrated for realistic in-flight agent turns: a typical + # coding conversation mid-reasoning runs 60โ€“150s per call, so a 60s + # budget routinely interrupted legitimate work on /restart. Raise + # further in config.yaml if you run very-long-reasoning models. + "restart_drain_timeout": 180, # Max app-level retry attempts for API errors (connection drops, # provider timeouts, 5xx, etc.) before the agent surfaces the # failure. The OpenAI SDK already does its own low-level retries @@ -639,6 +644,18 @@ DEFAULT_CONFIG = { "cache_ttl": "5m", }, + # OpenRouter-specific settings. + # response_cache: enable OpenRouter response caching (X-OpenRouter-Cache header). + # When enabled, identical requests return cached responses for free (zero billing). + # This is separate from Anthropic prompt caching and works alongside it. + # See: https://openrouter.ai/docs/guides/features/response-caching + # response_cache_ttl: how long cached responses remain valid, in seconds (1-86400). + # Default 300 (5 minutes). Only used when response_cache is enabled. + "openrouter": { + "response_cache": True, + "response_cache_ttl": 300, + }, + # AWS Bedrock provider configuration. # Only used when model.provider is "bedrock". "bedrock": { @@ -825,7 +842,7 @@ DEFAULT_CONFIG = { # Voices: alloy, echo, fable, onyx, nova, shimmer }, "xai": { - "voice_id": "eve", + "voice_id": "eve", # or custom voice ID โ€” see https://docs.x.ai/developers/model-capabilities/audio/custom-voices "language": "en", "sample_rate": 24000, "bit_rate": 128000, diff --git a/hermes_cli/curator.py b/hermes_cli/curator.py index b6646d7299..df69aa7d5d 100644 --- a/hermes_cli/curator.py +++ b/hermes_cli/curator.py @@ -302,9 +302,21 @@ def _cmd_rollback(args) -> int: print(f" reason: {manifest.get('reason', '?')}") print(f" created_at: {manifest.get('created_at', '?')}") print(f" skill files: {manifest.get('skill_files', '?')}") + cron = manifest.get("cron_jobs") or {} + if isinstance(cron, dict): + if cron.get("backed_up"): + print( + f" cron jobs: {cron.get('jobs_count', 0)} " + f"(will be restored for skill-link fields only)" + ) + else: + reason = cron.get("reason", "not captured") + print(f" cron jobs: not in snapshot ({reason})") print( "\nThis will replace the current ~/.hermes/skills/ tree (a safety " - "snapshot of the current state is taken first so this is undoable)." + "snapshot of the current state is taken first so this is undoable). " + "Cron jobs that still exist will have their skills/skill fields " + "restored from the snapshot; all other cron fields are left alone." ) if not getattr(args, "yes", False): diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py index f0822bdce8..122ed141cc 100644 --- a/hermes_cli/doctor.py +++ b/hermes_cli/doctor.py @@ -263,8 +263,11 @@ def run_doctor(args): if env_path.exists(): check_ok(f"{_DHH}/.env file exists") - # Check for common issues - content = env_path.read_text() + # Check for common issues. Pin encoding to UTF-8 because .env files are + # written as UTF-8 everywhere in the codebase, while Path.read_text() + # defaults to the system locale โ€” which crashes on non-UTF-8 Windows + # locales (e.g. GBK) as soon as the file contains any non-ASCII byte. + content = env_path.read_text(encoding="utf-8") if _has_provider_env_config(content): check_ok("API key or custom endpoint configured") else: diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 5821a40c78..a12999b77e 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -289,7 +289,7 @@ def _has_any_provider_configured() -> bool: env_file = get_env_path() if env_file.exists(): try: - for line in env_file.read_text().splitlines(): + for line in env_file.read_text(encoding="utf-8").splitlines(): line = line.strip() if line.startswith("#") or "=" not in line: continue diff --git a/hermes_cli/memory_setup.py b/hermes_cli/memory_setup.py index 88186b8ec6..158f80a766 100644 --- a/hermes_cli/memory_setup.py +++ b/hermes_cli/memory_setup.py @@ -361,7 +361,7 @@ def _write_env_vars(env_path: Path, env_writes: dict) -> None: existing_lines = [] if env_path.exists(): - existing_lines = env_path.read_text().splitlines() + existing_lines = env_path.read_text(encoding="utf-8").splitlines() updated_keys = set() new_lines = [] diff --git a/hermes_cli/model_switch.py b/hermes_cli/model_switch.py index 07455eb6fa..4c323145da 100644 --- a/hermes_cli/model_switch.py +++ b/hermes_cli/model_switch.py @@ -1057,6 +1057,45 @@ def list_authenticated_providers( if normed: _builtin_endpoints.add(normed) + def _has_fast_aws_sdk_signal() -> bool: + """Return True when explicit AWS auth config is present. + + This intentionally avoids botocore's full credential chain. Provider + picker/model-switch discovery can run for non-Bedrock providers, and + botocore may otherwise probe EC2 IMDS (169.254.169.254) on local + machines before returning no credentials. + """ + if os.environ.get("AWS_BEARER_TOKEN_BEDROCK", "").strip(): + return True + if ( + os.environ.get("AWS_ACCESS_KEY_ID", "").strip() + and os.environ.get("AWS_SECRET_ACCESS_KEY", "").strip() + ): + return True + return any( + os.environ.get(name, "").strip() + for name in ( + "AWS_PROFILE", + "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", + "AWS_CONTAINER_CREDENTIALS_FULL_URI", + "AWS_WEB_IDENTITY_TOKEN_FILE", + ) + ) + + def _has_aws_sdk_creds_for_listing(slug: str) -> bool: + """Credential check for AWS SDK providers in non-runtime discovery.""" + slug_norm = str(slug or "").strip().lower() + current_norm = str(current_provider or "").strip().lower() + if _has_fast_aws_sdk_signal(): + return True + if slug_norm != current_norm: + return False + try: + from agent.bedrock_adapter import has_aws_credentials + return bool(has_aws_credentials()) + except Exception: + return False + data = fetch_models_dev() # Build curated model lists keyed by hermes provider ID @@ -1184,7 +1223,9 @@ def list_authenticated_providers( # Check if credentials exist has_creds = False - if overlay.extra_env_vars: + if overlay.auth_type == "aws_sdk": + has_creds = _has_aws_sdk_creds_for_listing(hermes_slug) + elif overlay.extra_env_vars: has_creds = any(os.environ.get(ev) for ev in overlay.extra_env_vars) # Also check api_key_env_vars from PROVIDER_REGISTRY for api_key auth_type if not has_creds and overlay.auth_type == "api_key": @@ -1324,11 +1365,7 @@ def list_authenticated_providers( # credentials come from the boto3 credential chain (env vars, # ~/.aws/credentials, instance roles, etc.) if not _cp_has_creds and _cp_config and getattr(_cp_config, "auth_type", "") == "aws_sdk": - try: - from agent.bedrock_adapter import has_aws_credentials - _cp_has_creds = has_aws_credentials() - except Exception: - pass + _cp_has_creds = _has_aws_sdk_creds_for_listing(_cp.slug) if not _cp_has_creds: continue diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 3933ad8494..31cb846012 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1190,6 +1190,13 @@ def _setup_tts_provider(config: dict): "Falling back to Edge TTS." ) selected = "edge" + if selected == "xai": + print() + voice_id = prompt("xAI voice_id (Enter for 'eve', or paste a custom voice ID)") + if voice_id and voice_id.strip(): + config.setdefault("tts", {}).setdefault("xai", {})["voice_id"] = voice_id.strip() + print_success(f"xAI voice_id set to: {voice_id.strip()}") + elif selected == "minimax": existing = get_env_value("MINIMAX_API_KEY") @@ -1643,7 +1650,11 @@ def setup_terminal_backend(config: dict): def _apply_default_agent_settings(config: dict): """Apply recommended defaults for all agent settings without prompting.""" config.setdefault("agent", {})["max_turns"] = 90 - save_env_value("HERMES_MAX_ITERATIONS", "90") + # config.yaml is the authoritative source for max_turns; the gateway + # bridges it into HERMES_MAX_ITERATIONS at startup. We no longer write + # to .env to avoid the dual-source inconsistency that caused the + # 60-vs-500 bug (stale .env entry silently shadowing config.yaml). + remove_env_value("HERMES_MAX_ITERATIONS") config.setdefault("display", {})["tool_progress"] = "all" @@ -1673,9 +1684,10 @@ def setup_agent_settings(config: dict): print() # โ”€โ”€ Max Iterations โ”€โ”€ - current_max = get_env_value("HERMES_MAX_ITERATIONS") or str( - cfg_get(config, "agent", "max_turns", default=90) - ) + # config.yaml is authoritative; read from there. If a legacy .env + # entry is still around (from pre-PR#18413 setups), prefer the + # config value so we don't surface a stale number to the user. + current_max = str(cfg_get(config, "agent", "max_turns", default=90)) print_info("Maximum tool-calling iterations per conversation.") print_info("Higher = more complex tasks, but costs more tokens.") print_info( @@ -1686,9 +1698,13 @@ def setup_agent_settings(config: dict): try: max_iter = int(max_iter_str) if max_iter > 0: - save_env_value("HERMES_MAX_ITERATIONS", str(max_iter)) + # Write to config.yaml (authoritative) only. Also clean up any + # stale .env entry from earlier setup runs โ€” the gateway's + # bridge in gateway/run.py now unconditionally derives + # HERMES_MAX_ITERATIONS from agent.max_turns at startup. config.setdefault("agent", {})["max_turns"] = max_iter config.pop("max_turns", None) + remove_env_value("HERMES_MAX_ITERATIONS") print_success(f"Max iterations set to {max_iter}") except ValueError: print_warning("Invalid number, keeping current value") diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 5edb227d95..b3df18d932 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -1822,7 +1822,7 @@ def _reconfigure_tool(config: dict): cat = TOOL_CATEGORIES.get(ts_key) reqs = TOOLSET_ENV_REQUIREMENTS.get(ts_key) if cat or reqs: - if _toolset_has_keys(ts_key, config): + if _toolset_has_keys(ts_key, config) or _toolset_enabled_for_reconfigure(ts_key, config): configurable.append((ts_key, ts_label)) if not configurable: @@ -1848,6 +1848,28 @@ def _reconfigure_tool(config: dict): save_config(config) +def _toolset_enabled_for_reconfigure(ts_key: str, config: dict) -> bool: + """Return True if a configurable toolset is enabled anywhere. + + Reconfigure must include enabled-but-unconfigured categories so users can + finish provider/API-key setup without disabling and re-enabling the toolset. + """ + for platform in PLATFORMS: + if not _toolset_allowed_for_platform(ts_key, platform): + continue + try: + enabled = _get_platform_tools( + config, + platform, + include_default_mcp_servers=False, + ) + except Exception: + continue + if ts_key in enabled: + return True + return False + + def _configure_tool_category_for_reconfig(ts_key: str, cat: dict, config: dict): """Reconfigure a tool category - provider selection + API key update.""" icon = cat.get("icon", "") diff --git a/hermes_constants.py b/hermes_constants.py index 35dbf86ab2..e63a4ec301 100644 --- a/hermes_constants.py +++ b/hermes_constants.py @@ -8,14 +8,64 @@ import os from pathlib import Path +_profile_fallback_warned: bool = False + + def get_hermes_home() -> Path: """Return the Hermes home directory (default: ~/.hermes). Reads HERMES_HOME env var, falls back to ~/.hermes. This is the single source of truth โ€” all other copies should import this. + + When ``HERMES_HOME`` is unset but an ``active_profile`` file indicates + a non-default profile is active, logs a loud one-shot warning to + ``errors.log`` so cross-profile data corruption is diagnosable instead + of silent. Behavior is unchanged otherwise โ€” we still return + ``~/.hermes`` โ€” because raising here would brick 30+ module-level + callers that import this at load time. Subprocess spawners are + expected to propagate ``HERMES_HOME`` explicitly (see the systemd + template in ``hermes_cli/gateway.py`` and the kanban dispatcher in + ``hermes_cli/kanban_db.py``). See https://github.com/NousResearch/hermes-agent/issues/18594. """ val = os.environ.get("HERMES_HOME", "").strip() - return Path(val) if val else Path.home() / ".hermes" + if val: + return Path(val) + + # Guard: if a non-default profile is sticky-active, warn once that + # the fallback to the default profile is almost certainly wrong. + global _profile_fallback_warned + if not _profile_fallback_warned: + try: + # Inline the default-root resolution from get_default_hermes_root() + # to stay import-safe (this function is called from module scope + # in 30+ files; we cannot afford to trigger logging setup here). + active_path = (Path.home() / ".hermes" / "active_profile") + active = active_path.read_text().strip() if active_path.exists() else "" + except (UnicodeDecodeError, OSError): + active = "" + if active and active != "default": + _profile_fallback_warned = True + # Write directly to stderr. We intentionally do NOT route this + # through ``logging`` because (a) this function is called at + # module-import time from 30+ sites, often before logging is + # configured, and (b) root-logger propagation would double-emit + # on consoles where a StreamHandler is already attached. + import sys + msg = ( + f"[HERMES_HOME fallback] HERMES_HOME is unset but active " + f"profile is {active!r}. Falling back to ~/.hermes, which " + f"is the DEFAULT profile โ€” not {active!r}. Any data this " + f"process writes will land in the wrong profile. The " + f"subprocess spawner should pass HERMES_HOME explicitly " + f"(see issue #18594)." + ) + try: + sys.stderr.write(msg + "\n") + sys.stderr.flush() + except Exception: + pass + + return Path.home() / ".hermes" def get_default_hermes_root() -> Path: diff --git a/optional-skills/creative/kanban-video-orchestrator/SKILL.md b/optional-skills/creative/kanban-video-orchestrator/SKILL.md new file mode 100644 index 0000000000..114e774ff6 --- /dev/null +++ b/optional-skills/creative/kanban-video-orchestrator/SKILL.md @@ -0,0 +1,206 @@ +--- +name: kanban-video-orchestrator +description: Plan, set up, and monitor a multi-agent video production pipeline backed by Hermes Kanban. Use when the user wants to make ANY video โ€” narrative film, product/marketing, music video, explainer, ASCII/terminal art, abstract/generative loop, comic, 3D, real-time/installation โ€” and the work warrants decomposition into specialized profiles (writer, designer, animator, renderer, voice, editor, etc.) coordinated through a kanban board. Performs adaptive discovery to scope the brief, designs an appropriate team for the requested style, generates the setup script that creates Hermes profiles + initial kanban task, then helps monitor execution and intervene when tasks stall or fail. Routes scenes to whichever Hermes rendering / audio / design skill fits each beat (`ascii-video`, `manim-video`, `p5js`, `comfyui`, `touchdesigner-mcp`, `blender-mcp`, `pixel-art`, `baoyu-comic`, `claude-design`, `excalidraw`, `songsee`, `heartmula`, โ€ฆ) plus external APIs for TTS, image-gen, and image-to-video as needed. +version: 1.0.0 +author: [SHL0MS, alt-glitch] +license: MIT +metadata: + hermes: + tags: [video, kanban, multi-agent, orchestration, production-pipeline] + related_skills: [kanban-orchestrator, kanban-worker, ascii-video, manim-video, p5js, comfyui, touchdesigner-mcp, blender-mcp, pixel-art, ascii-art, songwriting-and-ai-music, heartmula, songsee, spotify, youtube-content, claude-design, excalidraw, architecture-diagram, concept-diagrams, baoyu-comic, baoyu-infographic, humanizer, gif-search, meme-generation] + credits: | + The single-project workspace layout, profile-config patching pattern, + SOUL.md-per-profile model, TEAM.md task-graph convention, and + `--workspace dir:` discipline are adapted from alt-glitch's + original multi-agent video pipeline at + https://github.com/NousResearch/kanban-video-pipeline. +--- + +# Kanban Video Orchestrator + +Wrap any video request โ€” from a 15-second product teaser to a 5-minute narrative +short to a music video to an ASCII loop โ€” in a Hermes Kanban pipeline that +decomposes the work to specialized agent profiles. + +This skill does **not** render anything itself. It is a meta-pipeline that: + +1. **Scopes** the request through targeted discovery +2. **Designs** an appropriate team (which roles, which tools per role) based on the style +3. **Generates** a setup script that creates Hermes profiles, project workspace, and the initial kanban task +4. **Hands off** to the director profile, which decomposes via the kanban +5. **Monitors** execution, helps intervene when tasks stall or fail + +The actual rendering happens inside the kanban once it's running, via whichever +existing skills + tools fit the scenes โ€” `ascii-video`, `manim-video`, `p5js`, +`comfyui`, `touchdesigner-mcp`, `blender-mcp`, `songwriting-and-ai-music`, +`heartmula`, external APIs, or plain Python with PIL + ffmpeg. + +## When NOT to use this skill + +- The video is one continuous procedural project that needs no specialists. Just write the code directly. +- The user wants a quick one-shot conversion (e.g. "convert this mp4 to a GIF") โ€” use ffmpeg directly. +- The output is a static image, GIF, or audio-only artifact โ€” use the matching specific skill (`ascii-art`, `gifs`, `meme-generation`, `songwriting-and-ai-music`). +- The work fits a single existing skill cleanly (e.g. a pure ASCII video โ€” just use `ascii-video`). + +## Workflow + +``` +DISCOVER โ†’ BRIEF โ†’ TEAM DESIGN โ†’ SETUP โ†’ EXECUTE โ†’ MONITOR +``` + +### Step 1 โ€” Discover (ask the right questions) + +The discovery process is **adaptive**: ask only what is actually needed. Always +start with three questions to identify the broad shape: + +- **What is the video?** (one-sentence brief) +- **How long?** (5-30s teaser / 30-90s short / 90s-3min explainer / 3-10min film / longer) +- **What aspect ratio + target platform?** (1:1 / 9:16 / 16:9; X, IG, YouTube, internal, etc.) + +From the answer, classify the style category. The style determines which +follow-up questions to ask. **Do not ask all questions at once.** Ask 2-4 at a +time, listen, then proceed. Make reasonable assumptions whenever the user +implies an answer. + +For complete intake patterns and per-style question banks, see +**[references/intake.md](references/intake.md)**. + +### Step 2 โ€” Brief + +Once enough is known, produce a structured `brief.md` using the template in +`assets/brief.md.tmpl`. Stages: + +1. **Concept** โ€” the one-sentence pitch + emotional north star +2. **Scope** โ€” duration, aspect, platform, deadline +3. **Style** โ€” visual references, brand constraints, tone +4. **Scenes** โ€” beat-by-beat breakdown (durations, content, target tool) +5. **Audio** โ€” narration / music / SFX / silent (per scene if needed) +6. **Deliverables** โ€” file format, resolution, optional alternates (vertical cut, GIF, etc.) + +Show the brief to the user for confirmation before designing the team. **The +brief is the contract** โ€” every downstream task references it. + +### Step 3 โ€” Team design + +Pick role archetypes from the library that fit this video. **Compose, don't +clone.** Most videos need 4-7 profiles. The director is always present; the +rest are picked by what the brief actually requires. + +For the role library and per-style team compositions, see +**[references/role-archetypes.md](references/role-archetypes.md)**. + +For mapping role โ†’ which Hermes skills + toolsets it loads, see +**[references/tool-matrix.md](references/tool-matrix.md)**. + +### Step 4 โ€” Setup + +Generate a setup script (`setup.sh`) and run it. The script: + +1. Creates the project workspace (`~/projects/video-pipeline//`) +2. Copies any provided assets into `taste/`, `audio/`, `assets/` +3. Creates each Hermes profile via `hermes profile create --clone` +4. Writes per-profile `SOUL.md` (personality + role definition) +5. Configures profile YAML (toolsets, always_load skills, cwd) +6. Writes `brief.md`, `TEAM.md`, and `taste/` content +7. Fires the initial `hermes kanban create` task assigned to the director + +Use `scripts/bootstrap_pipeline.py` to generate setup.sh from a brief + +team-design JSON. See **[references/kanban-setup.md](references/kanban-setup.md)** +for the setup script structure, profile config patterns, and the critical +"shared workspace" rule. + +### Step 5 โ€” Execute + +Run `setup.sh`. Then provide the user with monitoring commands: + +```bash +hermes kanban watch --tenant # live events +hermes kanban list --tenant # board snapshot +hermes dashboard # visual board UI +``` + +The director profile takes over from here, decomposing the work and routing +tasks to specialist profiles via the kanban toolset. + +### Step 6 โ€” Monitor and intervene + +Stay engaged โ€” the kanban runs autonomously but a stuck task or bad output +needs human (or AI) judgment. + +Monitoring patterns: poll `kanban list` periodically, inspect any RUNNING task +that exceeds its expected duration with `kanban show `, and check +heartbeats. When a worker's output fails review, the standard interventions are: + +1. Comment on the worker's task with specific feedback (`kanban_comment`) +2. Create a re-run task with the original as parent +3. Adjust the brief's scope and let the director re-decompose + +For diagnostic patterns, intervention recipes, and the "task is stuck" +playbook, see **[references/monitoring.md](references/monitoring.md)**. + +## Reference: worked examples + +Six concrete pipelines covering very different video styles โ€” narrative film, +product/marketing, music video, math/algorithm explainer, ASCII video, real-time +installation โ€” showing how the same workflow yields very different teams and +task graphs. See **[references/examples.md](references/examples.md)**. + +## Critical rules + +1. **Discovery before action.** Never start generating a brief or team without + asking at least the three baseline questions. A bad brief cascades through + the entire pipeline. + +2. **Match the team to the video.** Don't reuse the same 4-profile setup for + every job. A music video that doesn't have a beat-analysis profile will + misfire. A narrative film that doesn't have a writer profile will produce + incoherent scenes. See `references/role-archetypes.md`. + +3. **One workspace per project.** All profiles for a given video share the same + `dir:` workspace. Tasks pass artifacts via shared filesystem and structured + handoffs. **Every** `kanban_create` call passes + `workspace_kind="dir"` + `workspace_path=""`. + +4. **Tenant every project.** Use a project-specific tenant + (`--tenant `). Keeps the dashboard scoped and prevents + cross-pollination with other ongoing kanbans. + +5. **Respect existing skills.** When a scene fits an existing skill, the + relevant renderer should load that skill via `--skill ` on its task + or `always_load` in its profile. Do not re-derive what a skill already + provides. + +6. **The director never executes.** Even with the full `kanban + terminal + + file` toolset, the director's `SOUL.md` rules forbid it from executing + work itself. It decomposes and routes only โ€” every concrete task becomes + a `hermes kanban create` call to a specialist profile. The + `kanban-orchestrator` skill spells this out further. + +7. **Don't over-decompose.** A 30-second product video does NOT need 20 tasks. + Aim for the smallest task graph that still parallelizes well and exposes the + right human-review gates. + +8. **Verify API keys BEFORE firing.** External APIs (TTS, image-gen, + image-to-video) need keys in `~/.hermes/.env` or the user's secret store. + A worker that hits a missing-key error wastes a task slot. The setup + script's `check_key` helper aborts cleanly if a required key is missing. + +## File map + +``` +SKILL.md โ† this file (workflow + rules) +references/ + intake.md โ† discovery question banks per style + role-archetypes.md โ† role library (writer, designer, animator, โ€ฆ) + tool-matrix.md โ† skill + toolset mapping per role + kanban-setup.md โ† setup script structure & profile config + monitoring.md โ† watch + intervene patterns + examples.md โ† six worked pipelines +assets/ + brief.md.tmpl โ† brief skeleton + setup.sh.tmpl โ† setup script skeleton + soul.md.tmpl โ† profile personality skeleton +scripts/ + bootstrap_pipeline.py โ† generate setup.sh from brief + team JSON + monitor.py โ† polling + intervention helpers +``` diff --git a/optional-skills/creative/kanban-video-orchestrator/assets/brief.md.tmpl b/optional-skills/creative/kanban-video-orchestrator/assets/brief.md.tmpl new file mode 100644 index 0000000000..fbe8d8cbfb --- /dev/null +++ b/optional-skills/creative/kanban-video-orchestrator/assets/brief.md.tmpl @@ -0,0 +1,79 @@ +# Video Brief โ€” {{TITLE}} + +> Slug: `{{SLUG}}` ยท Tenant: `{{TENANT}}` ยท Project workspace: `{{WORKSPACE}}` + +## 1. Concept + +**One-line pitch.** {{ONE_LINE_PITCH}} + +**Emotional north star.** {{EMOTIONAL_NORTH_STAR}} +*(What should the viewer feel walking away?)* + +## 2. Scope + +| | | +|---|---| +| Duration | {{DURATION_S}} seconds | +| Aspect ratio | {{ASPECT}} | +| Resolution | {{RESOLUTION}} | +| Frame rate | {{FPS}} fps | +| Target platforms | {{PLATFORMS}} | +| Deadline | {{DEADLINE}} | +| Quality bar | {{QUALITY_BAR}} *(rough draft / polished / archival)* | + +## 3. Style + +**Visual references.** {{VISUAL_REFS}} + +**Tone.** {{TONE}} + +**Brand constraints.** {{BRAND_CONSTRAINTS}} +*(colors, typography, motion language; or "n/a")* + +**Aesthetic rules.** +{{AESTHETIC_RULES}} + +## 4. Scenes + +Beat-by-beat breakdown. Each scene gets a row. + +| # | Time | Content | Target tool / skill | Audio | Notes | +|---|------|---------|---------------------|-------|-------| +| 1 | 0:00โ€“0:0X | {{SCENE_1_CONTENT}} | {{SCENE_1_TOOL}} | {{SCENE_1_AUDIO}} | {{SCENE_1_NOTES}} | +| 2 | 0:0Xโ€“0:0Y | ... | ... | ... | ... | + +## 5. Audio + +**Approach.** {{AUDIO_APPROACH}} +*(narration / music-only / synced to track / silent / mixed)* + +**Voiceover.** {{VO_DETAILS}} +*(provider, voice, language, script source โ€” "n/a" if no VO)* + +**Music.** {{MUSIC_DETAILS}} +*(provided track path / commission via Suno / commission via heartmula / +license-free / "n/a")* + +**SFX.** {{SFX_DETAILS}} +*(generated, library, or "n/a")* + +## 6. Deliverables + +| Format | Resolution | Notes | +|--------|-----------|-------| +| {{PRIMARY_FORMAT}} | {{PRIMARY_RES}} | The main output | +| {{ALT_FORMAT_1}} | {{ALT_RES_1}} | {{ALT_NOTES_1}} | + +**Final filename.** `output/final.mp4` +*(plus optional `output/final-9x16.mp4`, `output/captions.srt`, etc.)* + +## 7. Constraints + +- API keys required: {{API_KEYS_REQUIRED}} +- External dependencies: {{EXT_DEPS}} +- Source assets to incorporate: {{SOURCE_ASSETS}} + +--- + +**This brief is the contract. The director and every downstream profile read +it. If the brief changes, the kanban must be re-fired โ€” don't edit live.** diff --git a/optional-skills/creative/kanban-video-orchestrator/assets/setup.sh.tmpl b/optional-skills/creative/kanban-video-orchestrator/assets/setup.sh.tmpl new file mode 100644 index 0000000000..01d836def8 --- /dev/null +++ b/optional-skills/creative/kanban-video-orchestrator/assets/setup.sh.tmpl @@ -0,0 +1,185 @@ +#!/usr/bin/env bash +# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• +# Video Pipeline Setup โ€” {{TITLE}} +# +# Generated by kanban-video-orchestrator skill. +# +# Slug: {{SLUG}} +# Workspace: {{WORKSPACE}} +# Tenant: {{TENANT}} +# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• +set -euo pipefail + +PROJECT_SLUG="{{SLUG}}" +WORKSPACE="$HOME/projects/video-pipeline/${PROJECT_SLUG}" +TENANT="{{TENANT}}" + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 1. Verify required API keys +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Checking required API keys โ•โ•โ•" + +check_key() { + local var="$1" + local kc_account="${2:-hermes}" + local kc_service="${3:-$1}" + if grep -q "^${var}=" "$HOME/.hermes/.env" 2>/dev/null && \ + [ -n "$(grep "^${var}=" "$HOME/.hermes/.env" | cut -d= -f2-)" ]; then + echo " โœ“ ${var} (env)" + return 0 + fi + if command -v security >/dev/null 2>&1 && \ + security find-generic-password -a "${kc_account}" -s "${kc_service}" -w >/dev/null 2>&1; then + echo " โœ“ ${var} (Keychain ${kc_account}/${kc_service})" + return 0 + fi + echo " โœ— ${var} not set in ~/.hermes/.env or Keychain (${kc_account}/${kc_service})" + return 1 +} + +# Customize this list per project โ€” only check keys actually used: +{{KEY_CHECKS}} + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 2. Create project workspace +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Creating project workspace โ•โ•โ•" +mkdir -p "$WORKSPACE"/{taste,audio/{voiceover,sfx},assets,scenes,checkpoints,tools,output} +{{SCENE_DIRS}} +echo " โœ“ $WORKSPACE" + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 3. Create Hermes profiles +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Creating Hermes profiles โ•โ•โ•" + +{{PROFILE_CREATE_COMMANDS}} + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 4. Configure profiles (toolsets, skills, cwd) +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Configuring profiles โ•โ•โ•" + +configure_profile() { + local profile="$1" + local toolsets_json="$2" # JSON array string, e.g. '["kanban","terminal","file"]' + local skills_json="$3" # JSON array string, e.g. '["kanban-worker","ascii-video"]' + python3 - "$profile" "$toolsets_json" "$skills_json" "$WORKSPACE" <<'PY' +"""Patch a Hermes profile config.yaml using PyYAML so we don't depend on the +exact default-config string format. Validates the patch took effect and exits +non-zero if anything's off.""" +import json +import os +import sys + +try: + import yaml +except ImportError: + print("ERROR: PyYAML required. pip install pyyaml", file=sys.stderr) + sys.exit(1) + +profile, toolsets_json, skills_json, workspace = sys.argv[1:5] +toolsets = json.loads(toolsets_json) +skills = json.loads(skills_json) + +p = os.path.expanduser(f"~/.hermes/profiles/{profile}/config.yaml") +if not os.path.exists(p): + print(f" โœ— profile config not found: {p}", file=sys.stderr) + sys.exit(1) + +with open(p) as f: + cfg = yaml.safe_load(f) or {} + +# Apply our changes โ€” only the keys we actually want to set. +cfg["toolsets"] = toolsets +cfg.setdefault("skills", {}) +cfg["skills"]["always_load"] = skills + +# Note: we do NOT touch cfg["approvals"] โ€” that's a security-sensitive +# setting (manual confirmation of tool calls). Workspace cwd is overridden +# per-task by `--workspace dir:` on `hermes kanban create`, so we +# don't need to mutate cfg["terminal"]["cwd"] either. + +with open(p, "w") as f: + yaml.safe_dump(cfg, f, sort_keys=False) + +# Validate +with open(p) as f: + after = yaml.safe_load(f) +errors = [] +if after.get("toolsets") != toolsets: + errors.append(f"toolsets mismatch: {after.get('toolsets')!r}") +if after.get("skills", {}).get("always_load") != skills: + errors.append(f"skills.always_load mismatch: {after.get('skills', {}).get('always_load')!r}") +if errors: + print(f" โœ— {profile}: " + "; ".join(errors), file=sys.stderr) + sys.exit(1) +PY + if [ $? -ne 0 ]; then + echo " โœ— failed to configure ${profile}" >&2 + exit 1 + fi + echo " โœ“ ${profile}" +} + +{{PROFILE_CONFIG_COMMANDS}} + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 5. Write SOUL.md per profile +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Writing profile personalities โ•โ•โ•" + +{{SOUL_WRITES}} + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 6. Copy brief, TEAM.md, and any provided assets +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Writing brief + taste โ•โ•โ•" + +cat > "$WORKSPACE/brief.md" <<'BRIEF_EOF' +{{BRIEF_CONTENTS}} +BRIEF_EOF + +cat > "$WORKSPACE/TEAM.md" <<'TEAM_EOF' +{{TEAM_CONTENTS}} +TEAM_EOF + +{{TASTE_WRITES}} + +{{ASSET_COPIES}} + +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# 7. Fire the initial kanban task +# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +echo "โ•โ•โ• Firing initial kanban task โ•โ•โ•" + +hermes kanban create "Direct production of {{TITLE}}" \ + --assignee director \ + --workspace dir:"$WORKSPACE" \ + --tenant "$TENANT" \ + --priority 2 \ + --max-runtime 4h \ + --body "$(cat < **Credit:** the single-project-workspace layout, profile-config patching +> approach, SOUL.md-per-profile convention, and `--workspace dir:` rule +> are adapted from alt-glitch's original multi-agent video pipeline: +> [NousResearch/kanban-video-pipeline](https://github.com/NousResearch/kanban-video-pipeline). +> This skill generalizes those patterns across video styles and replaces the +> string-replacement config patcher with a PyYAML-based one. + +## Project workspace structure + +Every video project gets one workspace under `~/projects/video-pipeline//`: + +``` +~/projects/video-pipeline// +โ”œโ”€โ”€ brief.md โ† the contract; all tasks reference +โ”œโ”€โ”€ TEAM.md โ† team composition + task graph (director reads this) +โ”œโ”€โ”€ taste/ +โ”‚ โ”œโ”€โ”€ brand-guide.md โ† color, typography, motion rules +โ”‚ โ”œโ”€โ”€ emotional-dna.md โ† what the piece should FEEL like +โ”‚ โ””โ”€โ”€ style-frames/ โ† optional: visual references +โ”œโ”€โ”€ audio/ +โ”‚ โ”œโ”€โ”€ track.mp3 โ† provided music (if any) +โ”‚ โ”œโ”€โ”€ voiceover/ โ† per-line TTS clips +โ”‚ โ””โ”€โ”€ sfx/ โ† sound effects +โ”œโ”€โ”€ assets/ +โ”‚ โ”œโ”€โ”€ logos/ +โ”‚ โ”œโ”€โ”€ fonts/ +โ”‚ โ””โ”€โ”€ existing-footage/ โ† reusable provided clips +โ”œโ”€โ”€ scenes/ +โ”‚ โ”œโ”€โ”€ scene-01/ +โ”‚ โ”‚ โ”œโ”€โ”€ VISUAL_SPEC.md โ† cinematographer's per-scene spec +โ”‚ โ”‚ โ”œโ”€โ”€ render.py โ† renderer's code (or sketch.html, etc.) +โ”‚ โ”‚ โ”œโ”€โ”€ checkpoints/ โ† preview frames for QA +โ”‚ โ”‚ โ””โ”€โ”€ clip.mp4 โ† the deliverable for this scene +โ”‚ โ”œโ”€โ”€ scene-02/... +โ”‚ โ””โ”€โ”€ ... +โ”œโ”€โ”€ checkpoints/ โ† global review frames +โ”œโ”€โ”€ tools/ โ† optional project-local helpers +โ””โ”€โ”€ output/ + โ”œโ”€โ”€ final.mp4 โ† stitched + audio + โ”œโ”€โ”€ final-noaudio.mp4 + โ”œโ”€โ”€ final-9x16.mp4 โ† optional: vertical alternate + โ””โ”€โ”€ captions.srt โ† optional: subtitle file +``` + +**The slug** is derived from the brief title: lowercase, hyphen-separated. +Example: `q3-product-teaser`, `ascii-mood-loop`, `interview-cut-2026-q1`. + +## The setup.sh script + +The setup script does six things in order: + +1. **Create workspace tree** โ€” all directories above +2. **Create profiles** โ€” `hermes profile create --clone` +3. **Configure profiles** โ€” patch each profile's + `~/.hermes/profiles//config.yaml` to set toolsets, always_load skills, + and `cwd` +4. **Write SOUL.md per profile** โ€” the personality + role definition +5. **Copy any provided assets + write `brief.md`, `TEAM.md`, and `taste/`** +6. **Fire the initial kanban task** โ€” `hermes kanban create` assigned to the director + +See `assets/setup.sh.tmpl` for the skeleton. + +### Profile creation pattern + +```bash +hermes profile create director --clone 2>/dev/null || true +``` + +The `--clone` flag clones from the active profile (preserving model, base +config). The `|| true` makes the script idempotent โ€” re-running won't error if +the profile already exists. + +### Profile config patching + +Each profile has a YAML config at `~/.hermes/profiles//config.yaml`. The +setup script edits exactly two keys: + +1. `toolsets:` โ€” replace the default with the role's required toolsets +2. `skills.always_load:` โ€” list the role's must-load skills (may be empty) + +**Do NOT** modify `approvals.mode` (controls user-confirmation of tool calls +โ€” a security setting that must stay as the user configured it). **Do NOT** +modify `terminal.cwd` โ€” the kanban dispatcher overrides cwd per-task via +`--workspace dir:`, so the profile's cwd is irrelevant to the kanban +work and changing it could break the user's interactive use of the profile. + +Use **PyYAML**, not string replacement, so the patch is robust against +default-config schema drift: + +```bash +configure_profile() { + local profile="$1" + local toolsets_json="$2" # JSON array, e.g. '["kanban","terminal","file"]' + local skills_json="$3" # JSON array, e.g. '["kanban-worker","ascii-video"]' + python3 - "$profile" "$toolsets_json" "$skills_json" <<'PY' +import json, os, sys, yaml +profile, ts_json, sk_json = sys.argv[1:4] +p = os.path.expanduser(f"~/.hermes/profiles/{profile}/config.yaml") +with open(p) as f: + cfg = yaml.safe_load(f) or {} +cfg["toolsets"] = json.loads(ts_json) +cfg.setdefault("skills", {})["always_load"] = json.loads(sk_json) +with open(p, "w") as f: + yaml.safe_dump(cfg, f, sort_keys=False) +PY +} +``` + +PyYAML must be installed in the user's Python (it ships with most Hermes +installs). If absent: `pip install pyyaml`. + +The setup script should also **validate** the patch by re-reading the file +and comparing โ€” see `assets/setup.sh.tmpl` for the validation pattern. + +### SOUL.md per profile + +Each profile gets a `SOUL.md` at `~/.hermes/profiles//SOUL.md` that +defines its role, voice, and rules. See `assets/soul.md.tmpl` for the +template. Customize per role and per project. + +The director's SOUL.md should be the most opinionated โ€” its voice flavors +the entire production. **Critical content for the director's SOUL.md:** + +- **Anti-temptation rules:** "Do not execute the work yourself. For every + concrete task, create a kanban task and assign it. Decompose, route, comment, + approve โ€” that's the whole job." (The `kanban-orchestrator` skill provides + the deeper playbook; load it.) +- **Decomposition steps:** Read `brief.md`, `TEAM.md`, `taste/`. Use the team + graph in `TEAM.md` to fan out tasks. +- **The workspace_path rule** (see below). + +Other profiles' SOUL.md is briefer; mostly mechanical: who you are, what you +read, what you produce, what skills/tools to use, where to write outputs. +Most non-director profiles should `always_load: kanban-worker` for the +deeper-than-baseline kanban guidance. + +### Initial kanban task + +The final action of setup.sh is firing the kanban: + +```bash +hermes kanban create "Direct production of