From 7d79ce92ac22c85072981ef28e84abc82b2c679b Mon Sep 17 00:00:00 2001 From: aydnOktay Date: Thu, 5 Mar 2026 16:11:59 +0300 Subject: [PATCH 01/14] Improve type hints and error diagnostics in vision_tools --- tools/vision_tools.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 456f85583d..0b6d11194d 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -27,14 +27,15 @@ Usage: ) """ +import asyncio +import base64 import json import logging import os -import asyncio import uuid -import base64 from pathlib import Path -from typing import Dict, Any, Optional +from typing import Any, Awaitable, Dict, Optional +from urllib.parse import urlparse import httpx from openai import AsyncOpenAI from agent.auxiliary_client import get_vision_auxiliary_client @@ -73,15 +74,18 @@ def _validate_image_url(url: str) -> bool: """ if not url or not isinstance(url, str): return False - - # Check if it's a valid URL format - if not (url.startswith('http://') or url.startswith('https://')): + + # Basic HTTP/HTTPS URL check + if not (url.startswith("http://") or url.startswith("https://")): return False - - # Check for common image extensions (optional, as URLs may not have extensions) - image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'] - - return True # Allow all HTTP/HTTPS URLs for flexibility + + # Parse to ensure we at least have a network location; still allow URLs + # without file extensions (e.g. CDN endpoints that redirect to images). + parsed = urlparse(url) + if not parsed.netloc: + return False + + return True # Allow all well-formed HTTP/HTTPS URLs for flexibility async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path: @@ -131,7 +135,12 @@ async def _download_image(image_url: str, destination: Path, max_retries: int = logger.warning("Retrying in %ss...", wait_time) await asyncio.sleep(wait_time) else: - logger.error("Image download failed after %s attempts: %s", max_retries, str(e)[:100]) + logger.error( + "Image download failed after %s attempts: %s", + max_retries, + str(e)[:100], + exc_info=True, + ) raise last_error @@ -188,7 +197,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) async def vision_analyze_tool( image_url: str, user_prompt: str, - model: str = DEFAULT_VISION_MODEL + model: str = DEFAULT_VISION_MODEL, ) -> str: """ Analyze an image from a URL or local file path using vision AI. 
@@ -347,7 +356,7 @@ async def vision_analyze_tool( except Exception as e: error_msg = f"Error analyzing image: {str(e)}" - logger.error("%s", error_msg) + logger.error("%s", error_msg, exc_info=True) # Prepare error response result = { @@ -368,7 +377,9 @@ async def vision_analyze_tool( temp_image_path.unlink() logger.debug("Cleaned up temporary image file") except Exception as cleanup_error: - logger.warning("Could not delete temporary file: %s", cleanup_error) + logger.warning( + "Could not delete temporary file: %s", cleanup_error, exc_info=True + ) def check_vision_requirements() -> bool: @@ -464,10 +475,13 @@ VISION_ANALYZE_SCHEMA = { } -def _handle_vision_analyze(args, **kw): +def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: image_url = args.get("image_url", "") question = args.get("question", "") - full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}" + full_prompt = ( + "Fully describe and explain everything about this image, then answer the " + f"following question:\n\n{question}" + ) model = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview" return vision_analyze_tool(image_url, full_prompt, model) From 15561ec425a74f26bd2051f562d60ec43f78a050 Mon Sep 17 00:00:00 2001 From: jackx707 Date: Thu, 5 Mar 2026 14:34:36 +0000 Subject: [PATCH 02/14] feat: add WebResearchEnv RL environment for multi-step web research --- datagen-config-examples/web_research.yaml | 46 ++ environments/web_research_env.py | 517 ++++++++++++++++++++++ 2 files changed, 563 insertions(+) create mode 100644 datagen-config-examples/web_research.yaml create mode 100644 environments/web_research_env.py diff --git a/datagen-config-examples/web_research.yaml b/datagen-config-examples/web_research.yaml new file mode 100644 index 0000000000..6275dbed69 --- /dev/null +++ b/datagen-config-examples/web_research.yaml @@ -0,0 +1,46 @@ +# datagen-config-examples/web_research.yaml +# +# Batch data generation config for WebResearchEnv. +# Generates tool-calling trajectories for multi-step web research tasks. +# +# Usage: +# python batch_runner.py \ +# --config datagen-config-examples/web_research.yaml \ +# --run_name web_research_v1 + +environment: web-research + +# Toolsets available to the agent during data generation +toolsets: + - web + - file + +# How many parallel workers to use +num_workers: 4 + +# Questions per batch +batch_size: 20 + +# Total trajectories to generate (comment out to run full dataset) +max_items: 500 + +# Model to use for generation (override with --model flag) +model: openrouter/nousresearch/hermes-3-llama-3.1-405b + +# System prompt additions (ephemeral — not saved to trajectories) +ephemeral_system_prompt: | + You are a highly capable research agent. When asked a factual question, + always use web_search to find current, accurate information before answering. + Cite at least 2 sources. Be concise and accurate. 
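
# Example: overriding the generation model from the CLI. batch_runner.py,
# --config, and --run_name come from the usage block at the top of this file;
# --model is the override flag noted above. The model string is illustrative.
#   python batch_runner.py \
#     --config datagen-config-examples/web_research.yaml \
#     --run_name web_research_v1 \
#     --model openrouter/<provider>/<model>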
+ +# Output directory +output_dir: data/web_research_v1 + +# Trajectory compression settings (for fitting into training token budgets) +compression: + enabled: true + target_max_tokens: 16000 + +# Eval settings +eval_every: 100 # Run eval every N trajectories +eval_size: 25 # Number of held-out questions per eval run diff --git a/environments/web_research_env.py b/environments/web_research_env.py new file mode 100644 index 0000000000..e73eb45c6d --- /dev/null +++ b/environments/web_research_env.py @@ -0,0 +1,517 @@ +""" +WebResearchEnv — RL Environment for Multi-Step Web Research +============================================================ + +Trains models to do accurate, efficient, multi-source web research. + +Reward signals: + - Answer correctness (LLM judge, 0.0–1.0) + - Source diversity (used ≥2 distinct domains) + - Efficiency (penalizes excessive tool calls) + - Tool usage (bonus for actually using web tools) + +Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions + HuggingFace: google/frames-benchmark + Fallback: built-in sample questions (no HF token needed) + +Usage: + # Phase 1 (OpenAI-compatible server) + python environments/web_research_env.py serve \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel \ + --openai.server_type openai + + # With eval split + python environments/web_research_env.py serve \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel \ + --env.eval_every 50 \ + --env.eval_size 20 + + # Standalone eval (no training server needed) + python environments/web_research_env.py eval \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel + +Built by: github.com/jackx707 +Inspired by: GroceryMind — production Hermes agent doing live web research + across German grocery stores (firecrawl + hermes-agent) +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import random +import re +from typing import Any, Optional +from urllib.parse import urlparse + +# --------------------------------------------------------------------------- +# Optional HuggingFace datasets import +# --------------------------------------------------------------------------- +try: + from datasets import load_dataset + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +from environments.hermes_base_env import HermesAgentBaseEnv + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Fallback sample dataset (used when HuggingFace is unavailable) +# These are multi-hop questions that require real web search to answer. +# --------------------------------------------------------------------------- +SAMPLE_QUESTIONS = [ + { + "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?", + "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?", + "answer": "The Linux Foundation oversees Kubernetes. 
CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What programming language was used to write the original version of the web framework used by Instagram?", + "answer": "Django, which Instagram was built on, is written in Python.", + "difficulty": "easy", + "hops": 2, + }, + { + "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?", + "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).", + "difficulty": "hard", + "hops": 3, + }, + { + "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?", + "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "How many employees does the parent company of Instagram have?", + "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?", + "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.", + "difficulty": "hard", + "hops": 2, + }, + { + "question": "Which company acquired the startup founded by the creator of Oculus VR?", + "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What is the market cap of the company that owns the most popular search engine in Russia?", + "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.", + "difficulty": "hard", + "hops": 2, + }, + { + "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?", + "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.", + "difficulty": "hard", + "hops": 2, + }, +] + + +# --------------------------------------------------------------------------- +# Environment +# --------------------------------------------------------------------------- + +class WebResearchEnv(HermesAgentBaseEnv): + """ + RL environment for training multi-step web research skills. + + The model is given a factual question requiring 2-3 hops of web research + and must use web_search / web_extract tools to find and synthesize the answer. + + Reward is multi-signal: + 60% — answer correctness (LLM judge) + 20% — tool usage (did the model actually search the web?) + 20% — efficiency (penalizes >6 tool calls) + + Bonus +0.1 for source diversity (≥2 distinct domains cited). 
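
    Worked example (illustrative numbers): judge correctness 0.8, web tools
    used, 4 tool calls, 3 distinct domains cited:
        reward = 0.6 * 0.8 + 0.2 * 1.0 + 0.2 * 1.0 + 0.1 = 0.98
    (The final reward is clamped to [0, 1] in compute_reward.)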
+ """ + + name = "web-research" + + # Default toolsets for this environment — web + file for saving notes + default_toolsets = ["web", "file"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._items: list[dict] = [] + self._eval_items: list[dict] = [] + self._index: int = 0 + self._total_scored: int = 0 + self._total_reward: float = 0.0 + + # ------------------------------------------------------------------ + # 1. Setup — load dataset + # ------------------------------------------------------------------ + + async def setup(self) -> None: + """Load the FRAMES benchmark or fall back to built-in samples.""" + if HF_AVAILABLE: + try: + logger.info("Loading FRAMES benchmark from HuggingFace...") + ds = load_dataset("google/frames-benchmark", split="test") + self._items = [ + { + "question": row["Prompt"], + "answer": row["Answer"], + "difficulty": row.get("reasoning_types", "unknown"), + "hops": 2, + } + for row in ds + ] + # Hold out 10% for eval + eval_size = max(20, len(self._items) // 10) + random.shuffle(self._items) + self._eval_items = self._items[:eval_size] + self._items = self._items[eval_size:] + logger.info( + f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items " + f"from FRAMES benchmark." + ) + return + except Exception as e: + logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.") + + # Fallback + random.shuffle(SAMPLE_QUESTIONS) + split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10) + self._items = SAMPLE_QUESTIONS[:split] + self._eval_items = SAMPLE_QUESTIONS[split:] + logger.info( + f"Using built-in sample dataset: {len(self._items)} train / " + f"{len(self._eval_items)} eval items." + ) + + # ------------------------------------------------------------------ + # 2. get_next_item — return the next question + # ------------------------------------------------------------------ + + async def get_next_item(self) -> dict: + """Return the next item, cycling through the dataset.""" + if not self._items: + raise RuntimeError("Dataset is empty. Did you call setup()?") + item = self._items[self._index % len(self._items)] + self._index += 1 + return item + + # ------------------------------------------------------------------ + # 3. format_prompt — build the user-facing prompt + # ------------------------------------------------------------------ + + def format_prompt(self, item: dict) -> str: + """ + Format the research question as a task prompt. + Instructs the model to use web search and cite sources. + """ + return ( + f"Research the following question thoroughly using web search. " + f"You MUST search the web to find current, accurate information — " + f"do not rely solely on your training data.\n\n" + f"Question: {item['question']}\n\n" + f"Requirements:\n" + f"- Use web_search and/or web_extract tools to find information\n" + f"- Search at least 2 different sources\n" + f"- Provide a concise, accurate answer (2-4 sentences)\n" + f"- Cite the sources you used" + ) + + # ------------------------------------------------------------------ + # 4. compute_reward — multi-signal scoring + # ------------------------------------------------------------------ + + async def compute_reward( + self, + item: dict, + result: dict, + ctx: Any, # ToolContext + ) -> float: + """ + Multi-signal reward function: + + 0.6 * correctness — LLM judge comparing answer to ground truth + 0.2 * tool_used — binary: did the model use web tools? 
            0.2 * efficiency — penalizes wasteful tool usage
            +0.1 bonus — source diversity (≥2 distinct domains)
        """
        final_response: str = result.get("final_response", "")
        tools_used: list[str] = result.get("tools_used", [])
        tool_call_count: int = result.get("tool_call_count", len(tools_used))

        # ---- Signal 1: Answer correctness (LLM judge) ----------------
        correctness = await self._llm_judge(
            question=item["question"],
            expected=item["answer"],
            model_answer=final_response,
            ctx=ctx,
        )

        # ---- Signal 2: Web tool usage --------------------------------
        web_tools = {"web_search", "web_extract", "search", "firecrawl"}
        tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0

        # ---- Signal 3: Efficiency ------------------------------------
        # Ideal: 2-5 tool calls. Penalised from the 6th call on; the
        # slope steepens past 10, and efficiency reaches 0.0 near 14.
        if tool_call_count <= 5:
            efficiency = 1.0
        elif tool_call_count <= 10:
            efficiency = 1.0 - (tool_call_count - 5) * 0.08
        else:
            efficiency = max(0.0, 1.0 - (tool_call_count - 5) * 0.12)

        # ---- Bonus: Source diversity ---------------------------------
        domains = self._extract_domains(final_response)
        diversity_bonus = 0.1 if len(domains) >= 2 else 0.0

        # ---- Combine ------------------------------------------------
        reward = (
            0.6 * correctness
            + 0.2 * tool_used
            + 0.2 * efficiency
            + diversity_bonus
        )
        reward = min(1.0, max(0.0, reward))  # clamp to [0, 1]

        # Track running stats
        self._total_scored += 1
        self._total_reward += reward

        logger.debug(
            f"Reward breakdown — correctness={correctness:.2f}, "
            f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
            f"diversity_bonus={diversity_bonus:.1f} → total={reward:.3f}"
        )

        return reward

    # ------------------------------------------------------------------
    # 5. evaluate — run on held-out eval split
    # ------------------------------------------------------------------

    async def evaluate(
        self,
        *args: Any,
        eval_size: Optional[int] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Run evaluation on the held-out split.
        Returns a dict of metrics for logging.
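
        Example return value (illustrative numbers; keys match the code below):
            {"eval/mean_reward": 0.64, "eval/mean_correctness": 0.58,
             "eval/n_items": 20, "train/mean_reward_so_far": 0.61}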
+ """ + items = self._eval_items + if eval_size: + items = items[:eval_size] + + if not items: + logger.warning("No eval items available.") + return {} + + logger.info(f"Running eval on {len(items)} questions...") + + rewards = [] + correctness_scores = [] + + for item in items: + try: + # Run the agent on each eval question + result = await self._run_agent_on_item(item) + reward = await self.compute_reward(item, result, ctx=None) + rewards.append(reward) + + # Also track raw correctness separately + if result.get("final_response"): + correctness_scores.append( + await self._llm_judge( + question=item["question"], + expected=item["answer"], + model_answer=result["final_response"], + ctx=None, + ) + ) + except Exception as e: + logger.error(f"Eval error on item: {e}") + rewards.append(0.0) + + metrics = { + "eval/mean_reward": sum(rewards) / len(rewards) if rewards else 0.0, + "eval/mean_correctness": ( + sum(correctness_scores) / len(correctness_scores) + if correctness_scores else 0.0 + ), + "eval/n_items": len(rewards), + "train/mean_reward_so_far": ( + self._total_reward / self._total_scored + if self._total_scored > 0 else 0.0 + ), + } + + logger.info( + f"Eval complete — mean_reward={metrics['eval/mean_reward']:.3f}, " + f"mean_correctness={metrics['eval/mean_correctness']:.3f}" + ) + return metrics + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + async def _llm_judge( + self, + question: str, + expected: str, + model_answer: str, + ctx: Any, + ) -> float: + """ + Use an LLM to judge whether `model_answer` correctly addresses + `question` compared to `expected`. Returns a float in [0, 1]. + + Uses the agent's own inference client if ctx is available, + otherwise falls back to a lightweight heuristic. + """ + if not model_answer or not model_answer.strip(): + return 0.0 + + # Build judge prompt + judge_prompt = ( + "You are an impartial judge evaluating the quality of an AI research answer.\n\n" + f"Question: {question}\n\n" + f"Reference answer: {expected}\n\n" + f"Model answer: {model_answer}\n\n" + "Score the model answer on a scale from 0.0 to 1.0 where:\n" + " 1.0 = fully correct and complete\n" + " 0.7 = mostly correct with minor gaps\n" + " 0.4 = partially correct\n" + " 0.1 = mentions relevant topic but wrong or very incomplete\n" + " 0.0 = completely wrong or no answer\n\n" + "Consider: factual accuracy, completeness, and relevance.\n" + "Respond with ONLY a JSON object: {\"score\": , \"reason\": \"\"}" + ) + + # Try using ctx for inference (Phase 2 / live training) + if ctx is not None and hasattr(ctx, "chat_completion"): + try: + response = await ctx.chat_completion( + messages=[{"role": "user", "content": judge_prompt}], + max_tokens=100, + temperature=0.0, + ) + text = response.get("content", "") + parsed = self._parse_judge_json(text) + if parsed is not None: + return float(parsed) + except Exception as e: + logger.debug(f"LLM judge via ctx failed: {e}. 
Using heuristic.") + + # Fallback: keyword overlap heuristic + return self._heuristic_score(expected, model_answer) + + @staticmethod + def _parse_judge_json(text: str) -> Optional[float]: + """Extract the score float from LLM judge JSON response.""" + try: + # Strip markdown code fences if present + clean = re.sub(r"```(?:json)?|```", "", text).strip() + data = json.loads(clean) + score = float(data.get("score", -1)) + if 0.0 <= score <= 1.0: + return score + except Exception: + # Try regex fallback + match = re.search(r'"score"\s*:\s*([0-9.]+)', text) + if match: + score = float(match.group(1)) + if 0.0 <= score <= 1.0: + return score + return None + + @staticmethod + def _heuristic_score(expected: str, model_answer: str) -> float: + """ + Lightweight keyword overlap score as fallback when no LLM is available. + Extracts meaningful tokens and computes Jaccard similarity. + """ + stopwords = { + "the", "a", "an", "is", "are", "was", "were", "of", "in", "on", + "at", "to", "for", "with", "and", "or", "but", "it", "its", + "this", "that", "as", "by", "from", "be", "has", "have", "had", + } + + def tokenize(text: str) -> set: + tokens = re.findall(r'\b[a-zA-Z0-9]+\b', text.lower()) + return {t for t in tokens if t not in stopwords and len(t) > 2} + + expected_tokens = tokenize(expected) + answer_tokens = tokenize(model_answer) + + if not expected_tokens: + return 0.5 # Can't judge + + overlap = len(expected_tokens & answer_tokens) + union = len(expected_tokens | answer_tokens) + + jaccard = overlap / union if union > 0 else 0.0 + # Recall-weighted: reward covering expected content + recall = overlap / len(expected_tokens) + return min(1.0, 0.4 * jaccard + 0.6 * recall) + + @staticmethod + def _extract_domains(text: str) -> set: + """ + Extract unique domains from URLs cited in the response. + Used to measure source diversity. + """ + urls = re.findall(r'https?://[^\s\)>\]"\']+', text) + domains = set() + for url in urls: + try: + parsed = urlparse(url) + # Normalize: strip www. + domain = parsed.netloc.lower().lstrip("www.") + if domain: + domains.add(domain) + except Exception: + pass + return domains + + async def _run_agent_on_item(self, item: dict) -> dict: + """ + Stub for running agent during eval. In Phase 1/2, this is handled + by the Atropos framework's rollout mechanism. Provided here for + standalone eval compatibility. + """ + # In real usage, the framework calls get_next_item + format_prompt + # and runs the agent. This stub returns an empty result for safety. 
+ return { + "final_response": "", + "tools_used": [], + "tool_call_count": 0, + } + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + WebResearchEnv.cli() From 36214d14db03cc17f8e16cf7c21333baaca27592 Mon Sep 17 00:00:00 2001 From: PercyDikec Date: Thu, 5 Mar 2026 21:12:53 +0300 Subject: [PATCH 03/14] fix(cli): use correct visibility filter string in codex API model fetch --- hermes_cli/codex_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hermes_cli/codex_models.py b/hermes_cli/codex_models.py index 416c76add5..662e576dc2 100644 --- a/hermes_cli/codex_models.py +++ b/hermes_cli/codex_models.py @@ -47,7 +47,7 @@ def _fetch_models_from_api(access_token: str) -> List[str]: if item.get("supported_in_api") is False: continue visibility = item.get("visibility", "") - if isinstance(visibility, str) and visibility.strip().lower() == "hide": + if isinstance(visibility, str) and visibility.strip().lower() == "hidden": continue priority = item.get("priority") rank = int(priority) if isinstance(priority, (int, float)) else 10_000 From 34e8d088c21f072a6f2fc9ffdaacbcd47e2a324e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 13:02:59 -0700 Subject: [PATCH 04/14] feat(slack): fix app_mention 404 + add document/video support - Register no-op app_mention event handler to suppress Bolt 404 errors. The 'message' handler already processes @mentions in channels, so app_mention is acknowledged without duplicate processing. - Add send_document() for native file attachments (PDFs, CSVs, etc.) via files_upload_v2, matching the pattern from Telegram PR #779. - Add send_video() for native video uploads via files_upload_v2. - Handle incoming document attachments from users: download, cache, and inject text content for .txt/.md files (capped at 100KB), following the same pattern as the Telegram adapter. - Add _download_slack_file_bytes() helper for raw byte downloads. - Add 24 new tests covering all new functionality. Fixes the unhandled app_mention events reported in gateway logs. --- gateway/platforms/slack.py | 134 +++++++++ tests/gateway/test_slack.py | 532 ++++++++++++++++++++++++++++++++++++ 2 files changed, 666 insertions(+) create mode 100644 tests/gateway/test_slack.py diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py index 11a73461e7..020843d3ac 100644 --- a/gateway/platforms/slack.py +++ b/gateway/platforms/slack.py @@ -10,6 +10,7 @@ Uses slack-bolt (Python) with Socket Mode for: import asyncio import os +import re from typing import Dict, List, Optional, Any try: @@ -33,6 +34,8 @@ from gateway.platforms.base import ( MessageEvent, MessageType, SendResult, + SUPPORTED_DOCUMENT_TYPES, + cache_document_from_bytes, cache_image_from_url, cache_audio_from_url, ) @@ -96,6 +99,13 @@ class SlackAdapter(BasePlatformAdapter): async def handle_message_event(event, say): await self._handle_slack_message(event) + # Acknowledge app_mention events to prevent Bolt 404 errors. + # The "message" handler above already processes @mentions in + # channels, so this is intentionally a no-op to avoid duplicates. 
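        # For reference, an app_mention event payload looks roughly like this
        # (abridged; see Slack's Events API docs for the full schema):
        #   {"type": "app_mention", "user": "U12345", "channel": "C0123",
        #    "text": "<@U_BOT> hello", "ts": "1234567890.000001"}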
+ @self._app.event("app_mention") + async def handle_app_mention(event, say): + pass + # Register slash command handler @self._app.command("/hermes") async def handle_hermes_command(ack, command): @@ -266,6 +276,65 @@ class SlackAdapter(BasePlatformAdapter): except Exception as e: return SendResult(success=False, error=str(e)) + async def send_video( + self, + chat_id: str, + video_path: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send a video file to Slack.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + if not os.path.exists(video_path): + return SendResult(success=False, error=f"Video file not found: {video_path}") + + try: + result = await self._app.client.files_upload_v2( + channel=chat_id, + file=video_path, + filename=os.path.basename(video_path), + initial_comment=caption or "", + thread_ts=reply_to, + ) + return SendResult(success=True, raw_response=result) + + except Exception as e: + print(f"[{self.name}] Failed to send video: {e}") + return await super().send_video(chat_id, video_path, caption, reply_to) + + async def send_document( + self, + chat_id: str, + file_path: str, + caption: Optional[str] = None, + file_name: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send a document/file attachment to Slack.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + if not os.path.exists(file_path): + return SendResult(success=False, error=f"File not found: {file_path}") + + display_name = file_name or os.path.basename(file_path) + + try: + result = await self._app.client.files_upload_v2( + channel=chat_id, + file=file_path, + filename=display_name, + initial_comment=caption or "", + thread_ts=reply_to, + ) + return SendResult(success=True, raw_response=result) + + except Exception as e: + print(f"[{self.name}] Failed to send document: {e}") + return await super().send_document(chat_id, file_path, caption, file_name, reply_to) + async def get_chat_info(self, chat_id: str) -> Dict[str, Any]: """Get information about a Slack channel.""" if not self._app: @@ -347,6 +416,58 @@ class SlackAdapter(BasePlatformAdapter): msg_type = MessageType.VOICE except Exception as e: print(f"[Slack] Failed to cache audio: {e}", flush=True) + elif url: + # Try to handle as a document attachment + try: + original_filename = f.get("name", "") + ext = "" + if original_filename: + _, ext = os.path.splitext(original_filename) + ext = ext.lower() + + # Fallback: reverse-lookup from MIME type + if not ext and mimetype: + mime_to_ext = {v: k for k, v in SUPPORTED_DOCUMENT_TYPES.items()} + ext = mime_to_ext.get(mimetype, "") + + if ext not in SUPPORTED_DOCUMENT_TYPES: + continue # Skip unsupported file types silently + + # Check file size (Slack limit: 20 MB for bots) + file_size = f.get("size", 0) + MAX_DOC_BYTES = 20 * 1024 * 1024 + if not file_size or file_size > MAX_DOC_BYTES: + print(f"[Slack] Document too large or unknown size: {file_size}", flush=True) + continue + + # Download and cache + raw_bytes = await self._download_slack_file_bytes(url) + cached_path = cache_document_from_bytes( + raw_bytes, original_filename or f"document{ext}" + ) + doc_mime = SUPPORTED_DOCUMENT_TYPES[ext] + media_urls.append(cached_path) + media_types.append(doc_mime) + msg_type = MessageType.DOCUMENT + print(f"[Slack] Cached user document: {cached_path}", flush=True) + + # Inject text content for .txt/.md files (capped at 100 KB) + MAX_TEXT_INJECT_BYTES = 100 * 1024 + if ext in 
(".md", ".txt") and len(raw_bytes) <= MAX_TEXT_INJECT_BYTES: + try: + text_content = raw_bytes.decode("utf-8") + display_name = original_filename or f"document{ext}" + display_name = re.sub(r'[^\w.\- ]', '_', display_name) + injection = f"[Content of {display_name}]:\n{text_content}" + if text: + text = f"{injection}\n\n{text}" + else: + text = injection + except UnicodeDecodeError: + pass # Binary content, skip injection + + except Exception as e: + print(f"[Slack] Failed to cache document: {e}", flush=True) # Build source source = self.build_source( @@ -427,3 +548,16 @@ class SlackAdapter(BasePlatformAdapter): else: from gateway.platforms.base import cache_image_from_bytes return cache_image_from_bytes(response.content, ext) + + async def _download_slack_file_bytes(self, url: str) -> bytes: + """Download a Slack file and return raw bytes.""" + import httpx + + bot_token = self.config.token + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get( + url, + headers={"Authorization": f"Bearer {bot_token}"}, + ) + response.raise_for_status() + return response.content diff --git a/tests/gateway/test_slack.py b/tests/gateway/test_slack.py new file mode 100644 index 0000000000..efdb62ce49 --- /dev/null +++ b/tests/gateway/test_slack.py @@ -0,0 +1,532 @@ +""" +Tests for Slack platform adapter. + +Covers: app_mention handler, send_document, send_video, + incoming document handling, message routing. + +Note: slack-bolt may not be installed in the test environment. +We mock the slack modules at import time to avoid collection errors. +""" + +import asyncio +import os +import sys +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from gateway.config import Platform, PlatformConfig +from gateway.platforms.base import ( + MessageEvent, + MessageType, + SendResult, + SUPPORTED_DOCUMENT_TYPES, +) + + +# --------------------------------------------------------------------------- +# Mock the slack-bolt package if it's not installed +# --------------------------------------------------------------------------- + +def _ensure_slack_mock(): + """Install mock slack modules so SlackAdapter can be imported.""" + if "slack_bolt" in sys.modules and hasattr(sys.modules["slack_bolt"], "__file__"): + return # Real library installed + + slack_bolt = MagicMock() + slack_bolt.async_app.AsyncApp = MagicMock + slack_bolt.adapter.socket_mode.async_handler.AsyncSocketModeHandler = MagicMock + + slack_sdk = MagicMock() + slack_sdk.web.async_client.AsyncWebClient = MagicMock + + for name, mod in [ + ("slack_bolt", slack_bolt), + ("slack_bolt.async_app", slack_bolt.async_app), + ("slack_bolt.adapter", slack_bolt.adapter), + ("slack_bolt.adapter.socket_mode", slack_bolt.adapter.socket_mode), + ("slack_bolt.adapter.socket_mode.async_handler", slack_bolt.adapter.socket_mode.async_handler), + ("slack_sdk", slack_sdk), + ("slack_sdk.web", slack_sdk.web), + ("slack_sdk.web.async_client", slack_sdk.web.async_client), + ]: + sys.modules.setdefault(name, mod) + + +_ensure_slack_mock() + +# Patch SLACK_AVAILABLE before importing the adapter +import gateway.platforms.slack as _slack_mod +_slack_mod.SLACK_AVAILABLE = True + +from gateway.platforms.slack import SlackAdapter # noqa: E402 + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture() +def adapter(): + config = PlatformConfig(enabled=True, token="xoxb-fake-token") + a = 
SlackAdapter(config) + # Mock the Slack app client + a._app = MagicMock() + a._app.client = AsyncMock() + a._bot_user_id = "U_BOT" + a._running = True + # Capture events instead of processing them + a.handle_message = AsyncMock() + return a + + +@pytest.fixture(autouse=True) +def _redirect_cache(tmp_path, monkeypatch): + """Point document cache to tmp_path so tests don't touch ~/.hermes.""" + monkeypatch.setattr( + "gateway.platforms.base.DOCUMENT_CACHE_DIR", tmp_path / "doc_cache" + ) + + +# --------------------------------------------------------------------------- +# TestAppMentionHandler +# --------------------------------------------------------------------------- + +class TestAppMentionHandler: + """Verify that the app_mention event handler is registered.""" + + def test_app_mention_registered_on_connect(self): + """connect() should register both 'message' and 'app_mention' handlers.""" + config = PlatformConfig(enabled=True, token="xoxb-fake") + adapter = SlackAdapter(config) + + # Track which events get registered + registered_events = [] + registered_commands = [] + + mock_app = MagicMock() + + def mock_event(event_type): + def decorator(fn): + registered_events.append(event_type) + return fn + return decorator + + def mock_command(cmd): + def decorator(fn): + registered_commands.append(cmd) + return fn + return decorator + + mock_app.event = mock_event + mock_app.command = mock_command + mock_app.client = AsyncMock() + mock_app.client.auth_test = AsyncMock(return_value={ + "user_id": "U_BOT", + "user": "testbot", + }) + + with patch.object(_slack_mod, "AsyncApp", return_value=mock_app), \ + patch.object(_slack_mod, "AsyncSocketModeHandler", return_value=MagicMock()), \ + patch.dict(os.environ, {"SLACK_APP_TOKEN": "xapp-fake"}), \ + patch("asyncio.create_task"): + asyncio.get_event_loop().run_until_complete(adapter.connect()) + + assert "message" in registered_events + assert "app_mention" in registered_events + assert "/hermes" in registered_commands + + +# --------------------------------------------------------------------------- +# TestSendDocument +# --------------------------------------------------------------------------- + +class TestSendDocument: + @pytest.mark.asyncio + async def test_send_document_success(self, adapter, tmp_path): + test_file = tmp_path / "report.pdf" + test_file.write_bytes(b"%PDF-1.4 fake content") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + caption="Here's the report", + ) + + assert result.success + adapter._app.client.files_upload_v2.assert_called_once() + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["channel"] == "C123" + assert call_kwargs["file"] == str(test_file) + assert call_kwargs["filename"] == "report.pdf" + assert call_kwargs["initial_comment"] == "Here's the report" + + @pytest.mark.asyncio + async def test_send_document_custom_name(self, adapter, tmp_path): + test_file = tmp_path / "data.csv" + test_file.write_bytes(b"a,b,c\n1,2,3") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + file_name="quarterly-report.csv", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["filename"] == "quarterly-report.csv" + + @pytest.mark.asyncio + async def test_send_document_missing_file(self, adapter): + result 
= await adapter.send_document( + chat_id="C123", + file_path="/nonexistent/file.pdf", + ) + + assert not result.success + assert "not found" in result.error.lower() + + @pytest.mark.asyncio + async def test_send_document_not_connected(self, adapter): + adapter._app = None + result = await adapter.send_document( + chat_id="C123", + file_path="/some/file.pdf", + ) + + assert not result.success + assert "Not connected" in result.error + + @pytest.mark.asyncio + async def test_send_document_api_error_falls_back(self, adapter, tmp_path): + test_file = tmp_path / "doc.pdf" + test_file.write_bytes(b"content") + + adapter._app.client.files_upload_v2 = AsyncMock( + side_effect=RuntimeError("Slack API error") + ) + + # Should fall back to base class (text message) + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + ) + + # Base class send() is also mocked, so check it was attempted + adapter._app.client.chat_postMessage.assert_called_once() + + @pytest.mark.asyncio + async def test_send_document_with_thread(self, adapter, tmp_path): + test_file = tmp_path / "notes.txt" + test_file.write_bytes(b"some notes") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + reply_to="1234567890.123456", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["thread_ts"] == "1234567890.123456" + + +# --------------------------------------------------------------------------- +# TestSendVideo +# --------------------------------------------------------------------------- + +class TestSendVideo: + @pytest.mark.asyncio + async def test_send_video_success(self, adapter, tmp_path): + video = tmp_path / "clip.mp4" + video.write_bytes(b"fake video data") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_video( + chat_id="C123", + video_path=str(video), + caption="Check this out", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["filename"] == "clip.mp4" + assert call_kwargs["initial_comment"] == "Check this out" + + @pytest.mark.asyncio + async def test_send_video_missing_file(self, adapter): + result = await adapter.send_video( + chat_id="C123", + video_path="/nonexistent/video.mp4", + ) + + assert not result.success + assert "not found" in result.error.lower() + + @pytest.mark.asyncio + async def test_send_video_not_connected(self, adapter): + adapter._app = None + result = await adapter.send_video( + chat_id="C123", + video_path="/some/video.mp4", + ) + + assert not result.success + assert "Not connected" in result.error + + @pytest.mark.asyncio + async def test_send_video_api_error_falls_back(self, adapter, tmp_path): + video = tmp_path / "clip.mp4" + video.write_bytes(b"fake video") + + adapter._app.client.files_upload_v2 = AsyncMock( + side_effect=RuntimeError("Slack API error") + ) + + # Should fall back to base class (text message) + result = await adapter.send_video( + chat_id="C123", + video_path=str(video), + ) + + adapter._app.client.chat_postMessage.assert_called_once() + + +# --------------------------------------------------------------------------- +# TestIncomingDocumentHandling +# --------------------------------------------------------------------------- + +class TestIncomingDocumentHandling: + def _make_event(self, files=None, text="hello", channel_type="im"): + 
"""Build a mock Slack message event with file attachments.""" + return { + "text": text, + "user": "U_USER", + "channel": "C123", + "channel_type": channel_type, + "ts": "1234567890.000001", + "files": files or [], + } + + @pytest.mark.asyncio + async def test_pdf_document_cached(self, adapter): + """A PDF attachment should be downloaded, cached, and set as DOCUMENT type.""" + pdf_bytes = b"%PDF-1.4 fake content" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = pdf_bytes + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "report.pdf", + "url_private_download": "https://files.slack.com/report.pdf", + "size": len(pdf_bytes), + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.message_type == MessageType.DOCUMENT + assert len(msg_event.media_urls) == 1 + assert os.path.exists(msg_event.media_urls[0]) + assert msg_event.media_types == ["application/pdf"] + + @pytest.mark.asyncio + async def test_txt_document_injects_content(self, adapter): + """A .txt file under 100KB should have its content injected into event text.""" + content = b"Hello from a text file" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event( + text="summarize this", + files=[{ + "mimetype": "text/plain", + "name": "notes.txt", + "url_private_download": "https://files.slack.com/notes.txt", + "size": len(content), + }], + ) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert "Hello from a text file" in msg_event.text + assert "[Content of notes.txt]" in msg_event.text + assert "summarize this" in msg_event.text + + @pytest.mark.asyncio + async def test_md_document_injects_content(self, adapter): + """A .md file under 100KB should have its content injected.""" + content = b"# Title\nSome markdown content" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event(files=[{ + "mimetype": "text/markdown", + "name": "readme.md", + "url_private_download": "https://files.slack.com/readme.md", + "size": len(content), + }], text="") + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert "# Title" in msg_event.text + + @pytest.mark.asyncio + async def test_large_txt_not_injected(self, adapter): + """A .txt file over 100KB should be cached but NOT injected.""" + content = b"x" * (200 * 1024) + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event(files=[{ + "mimetype": "text/plain", + "name": "big.txt", + "url_private_download": "https://files.slack.com/big.txt", + "size": len(content), + }], text="") + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert len(msg_event.media_urls) == 1 + assert "[Content of" not in (msg_event.text or "") + + @pytest.mark.asyncio + async def test_unsupported_file_type_skipped(self, adapter): + """A .zip file should be silently skipped.""" + event = self._make_event(files=[{ + "mimetype": "application/zip", + "name": "archive.zip", + "url_private_download": "https://files.slack.com/archive.zip", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] 
+ assert msg_event.message_type == MessageType.TEXT + assert len(msg_event.media_urls) == 0 + + @pytest.mark.asyncio + async def test_oversized_document_skipped(self, adapter): + """A document over 20MB should be skipped.""" + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "huge.pdf", + "url_private_download": "https://files.slack.com/huge.pdf", + "size": 25 * 1024 * 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert len(msg_event.media_urls) == 0 + + @pytest.mark.asyncio + async def test_document_download_error_handled(self, adapter): + """If document download fails, handler should not crash.""" + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.side_effect = RuntimeError("download failed") + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "report.pdf", + "url_private_download": "https://files.slack.com/report.pdf", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + # Handler should still be called (the exception is caught) + adapter.handle_message.assert_called_once() + + @pytest.mark.asyncio + async def test_image_still_handled(self, adapter): + """Image attachments should still go through the image path, not document.""" + with patch.object(adapter, "_download_slack_file", new_callable=AsyncMock) as dl: + dl.return_value = "/tmp/cached_image.jpg" + event = self._make_event(files=[{ + "mimetype": "image/jpeg", + "name": "photo.jpg", + "url_private_download": "https://files.slack.com/photo.jpg", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.message_type == MessageType.PHOTO + + +# --------------------------------------------------------------------------- +# TestMessageRouting +# --------------------------------------------------------------------------- + +class TestMessageRouting: + @pytest.mark.asyncio + async def test_dm_processed_without_mention(self, adapter): + """DM messages should be processed without requiring a bot mention.""" + event = { + "text": "hello", + "user": "U_USER", + "channel": "D123", + "channel_type": "im", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_called_once() + + @pytest.mark.asyncio + async def test_channel_message_requires_mention(self, adapter): + """Channel messages without a bot mention should be ignored.""" + event = { + "text": "just talking", + "user": "U_USER", + "channel": "C123", + "channel_type": "channel", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() + + @pytest.mark.asyncio + async def test_channel_mention_strips_bot_id(self, adapter): + """When mentioned in a channel, the bot mention should be stripped.""" + event = { + "text": "<@U_BOT> what's the weather?", + "user": "U_USER", + "channel": "C123", + "channel_type": "channel", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.text == "what's the weather?" 
+ assert "<@U_BOT>" not in msg_event.text + + @pytest.mark.asyncio + async def test_bot_messages_ignored(self, adapter): + """Messages from bots should be ignored.""" + event = { + "text": "bot response", + "bot_id": "B_OTHER", + "channel": "C123", + "channel_type": "im", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() + + @pytest.mark.asyncio + async def test_message_edits_ignored(self, adapter): + """Message edits should be ignored.""" + event = { + "text": "edited message", + "user": "U_USER", + "channel": "C123", + "channel_type": "im", + "ts": "1234567890.000001", + "subtype": "message_changed", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() From ac58309dbdb363692a4bd853364533244620e548 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 14:00:11 -0700 Subject: [PATCH 05/14] docs: improve Slack setup guide with channel event subscriptions and scopes The #1 support issue with Slack is 'bot works in DMs but not channels'. This is almost always caused by missing event subscriptions (message.channels, message.groups) or missing OAuth scopes (channels:history, groups:history). Changes: - slack.md: Move channels:history and groups:history from optional to required scopes. Move message.channels and message.groups to required events. Add new 'How the Bot Responds' section explaining DM vs channel behavior. Add Step 8 for inviting bot to channels. Expand troubleshooting table with specific 'works in DMs not channels' entry. Add quick checklist for channel debugging. - setup.py: Expand Slack setup wizard with all required scopes, event subscriptions, and a warning that without message.channels/message.groups the bot only works in DMs. Add link to full docs. Improve Member ID discovery instructions. - config.py: Update SLACK_BOT_TOKEN and SLACK_APP_TOKEN descriptions to list required scopes and event subscriptions inline. --- hermes_cli/config.py | 8 +- hermes_cli/setup.py | 22 ++++-- website/docs/user-guide/messaging/slack.md | 89 +++++++++++++++++----- 3 files changed, 95 insertions(+), 24 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7a31b551d4..7b689d764c 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -401,14 +401,18 @@ OPTIONAL_ENV_VARS = { "category": "messaging", }, "SLACK_BOT_TOKEN": { - "description": "Slack bot integration", + "description": "Slack bot token (xoxb-). Get from OAuth & Permissions after installing your app. " + "Required scopes: chat:write, app_mentions:read, channels:history, groups:history, " + "im:history, im:read, im:write, users:read, files:write", "prompt": "Slack Bot Token (xoxb-...)", "url": "https://api.slack.com/apps", "password": True, "category": "messaging", }, "SLACK_APP_TOKEN": { - "description": "Slack Socket Mode connection", + "description": "Slack app-level token (xapp-) for Socket Mode. Get from Basic Information → " + "App-Level Tokens. Also ensure Event Subscriptions include: message.im, " + "message.channels, message.groups, app_mention", "prompt": "Slack App Token (xapp-...)", "url": "https://api.slack.com/apps", "password": True, diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index c10caec9b0..5880b7ef35 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1572,10 +1572,22 @@ def setup_gateway(config: dict): if not existing_slack and prompt_yes_no("Set up Slack bot?", False): print_info("Steps to create a Slack app:") - print_info(" 1. 
Go to https://api.slack.com/apps → Create New App") - print_info(" 2. Enable Socket Mode: App Settings → Socket Mode → Enable") - print_info(" 3. Bot Token: OAuth & Permissions → Install to Workspace") - print_info(" 4. App Token: Basic Information → App-Level Tokens → Generate") + print_info(" 1. Go to https://api.slack.com/apps → Create New App (from scratch)") + print_info(" 2. Enable Socket Mode: Settings → Socket Mode → Enable") + print_info(" • Create an App-Level Token with 'connections:write' scope") + print_info(" 3. Add Bot Token Scopes: Features → OAuth & Permissions") + print_info(" Required scopes: chat:write, app_mentions:read,") + print_info(" channels:history, channels:read, groups:history,") + print_info(" im:history, im:read, im:write, users:read, files:write") + print_info(" 4. Subscribe to Events: Features → Event Subscriptions → Enable") + print_info(" Required events: message.im, message.channels,") + print_info(" message.groups, app_mention") + print_warning(" ⚠ Without message.channels/message.groups events,") + print_warning(" the bot will ONLY work in DMs, not channels!") + print_info(" 5. Install to Workspace: Settings → Install App") + print_info(" 6. After installing, invite the bot to channels: /invite @YourBot") + print() + print_info(" Full guide: https://hermes-agent.ai/docs/user-guide/messaging/slack") print() bot_token = prompt("Slack Bot Token (xoxb-...)", password=True) if bot_token: @@ -1587,7 +1599,7 @@ def setup_gateway(config: dict): print() print_info("🔒 Security: Restrict who can use your bot") - print_info(" Find Slack user IDs in your profile or via the Slack API") + print_info(" To find a Member ID: click a user's name → View full profile → ⋮ → Copy member ID") print() allowed_users = prompt("Allowed user IDs (comma-separated, leave empty for open access)") if allowed_users: diff --git a/website/docs/user-guide/messaging/slack.md b/website/docs/user-guide/messaging/slack.md index 52dde5f6a9..65d27ee830 100644 --- a/website/docs/user-guide/messaging/slack.md +++ b/website/docs/user-guide/messaging/slack.md @@ -46,20 +46,26 @@ Navigate to **Features → OAuth & Permissions** in the sidebar. Scroll to **Sco | Scope | Purpose | |-------|---------| | `chat:write` | Send messages as the bot | -| `app_mentions:read` | Respond when @mentioned in channels | +| `app_mentions:read` | Detect when @mentioned in channels | | `channels:history` | Read messages in public channels the bot is in | | `channels:read` | List and get info about public channels | +| `groups:history` | Read messages in private channels the bot is invited to | | `im:history` | Read direct message history | | `im:read` | View basic DM info | | `im:write` | Open and manage DMs | | `users:read` | Look up user information | +| `files:write` | Upload files (images, audio, documents) | + +:::caution Missing scopes = missing features +Without `channels:history` and `groups:history`, the bot **will not receive messages in channels** — +it will only work in DMs. These are the most commonly missed scopes. +::: **Optional scopes:** | Scope | Purpose | |-------|---------| -| `groups:history` | Read messages in private channels the bot is invited to | -| `files:write` | Upload files (audio, images) | +| `groups:read` | List and get info about private channels | --- @@ -83,23 +89,27 @@ You can always find or regenerate app-level tokens under **Settings → Basic In ## Step 4: Subscribe to Events +This step is critical — it controls what messages the bot can see. + 1. 
In the sidebar, go to **Features → Event Subscriptions** 2. Toggle **Enable Events** to ON 3. Expand **Subscribe to bot events** and add: -| Event | Purpose | -|-------|---------| -| `app_mention` | Bot responds when @mentioned in any channel | -| `message.im` | Bot responds to direct messages | - -**Optional event:** - -| Event | Purpose | -|-------|---------| -| `message.channels` | Bot sees all messages in public channels it's added to | +| Event | Required? | Purpose | +|-------|-----------|---------| +| `message.im` | **Yes** | Bot receives direct messages | +| `message.channels` | **Yes** | Bot receives messages in **public** channels it's added to | +| `message.groups` | **Recommended** | Bot receives messages in **private** channels it's invited to | +| `app_mention` | **Yes** | Prevents Bolt SDK errors when bot is @mentioned | 4. Click **Save Changes** at the bottom of the page +:::danger Missing event subscriptions is the #1 setup issue +If the bot works in DMs but **not in channels**, you almost certainly forgot to add +`message.channels` (for public channels) and/or `message.groups` (for private channels). +Without these events, Slack simply never delivers channel messages to the bot. +::: + --- ## Step 5: Install App to Workspace @@ -111,8 +121,8 @@ You can always find or regenerate app-level tokens under **Settings → Basic In 5. **Copy this token** — this is your `SLACK_BOT_TOKEN` :::tip -If you change scopes later, you'll need to **reinstall the app** for the new scopes to take effect. -The Install App page will show a banner prompting you to do so. +If you change scopes or event subscriptions later, you **must reinstall the app** for the changes +to take effect. The Install App page will show a banner prompting you to do so. ::: --- @@ -139,7 +149,7 @@ Add the following to your `~/.hermes/.env` file: ```bash # Required SLACK_BOT_TOKEN=xoxb-your-bot-token-here -SLACK_APP_TOKEN=xapp-your-app-level-token-here +SLACK_APP_TOKEN=xapp-your-app-token-here SLACK_ALLOWED_USERS=U01ABC2DEF3 # Comma-separated Member IDs # Optional @@ -161,6 +171,35 @@ hermes gateway install # Install as a system service --- +## Step 8: Invite the Bot to Channels + +After starting the gateway, you need to **invite the bot** to any channel where you want it to respond: + +``` +/invite @Hermes Agent +``` + +The bot will **not** automatically join channels. You must invite it to each channel individually. + +--- + +## How the Bot Responds + +Understanding how Hermes behaves in different contexts: + +| Context | Behavior | +|---------|----------| +| **DMs** | Bot responds to every message — no @mention needed | +| **Channels** | Bot **only responds when @mentioned** (e.g., `@Hermes Agent what time is it?`) | +| **Threads** | Bot replies in threads when the triggering message is in a thread | + +:::tip +In channels, always @mention the bot. Simply typing a message without mentioning it will be ignored. +This is intentional — it prevents the bot from responding to every message in busy channels. 
+::: + +--- + ## Home Channel Set `SLACK_HOME_CHANNEL` to a channel ID where Hermes will deliver scheduled messages, @@ -192,11 +231,27 @@ Hermes supports voice on Slack: | Problem | Solution | |---------|----------| | Bot doesn't respond to DMs | Verify `message.im` is in your event subscriptions and the app is reinstalled | -| Bot doesn't respond to @mentions | Verify `app_mention` is in your event subscriptions | +| Bot works in DMs but not in channels | **Most common issue.** Add `message.channels` and `message.groups` to event subscriptions, reinstall the app, and invite the bot to the channel with `/invite @Hermes Agent` | +| Bot doesn't respond to @mentions in channels | 1) Check `message.channels` event is subscribed. 2) Bot must be invited to the channel. 3) Ensure `channels:history` scope is added. 4) Reinstall the app after scope/event changes | +| Bot ignores messages in private channels | Add both the `message.groups` event subscription and `groups:history` scope, then reinstall the app and `/invite` the bot | | "not_authed" or "invalid_auth" errors | Regenerate your Bot Token and App Token, update `.env` | | Bot responds but can't post in a channel | Invite the bot to the channel with `/invite @Hermes Agent` | | "missing_scope" error | Add the required scope in OAuth & Permissions, then **reinstall** the app | | Socket disconnects frequently | Check your network; Bolt auto-reconnects but unstable connections cause lag | +| Changed scopes/events but nothing changed | You **must reinstall** the app to your workspace after any scope or event subscription change | + +### Quick Checklist + +If the bot isn't working in channels, verify **all** of the following: + +1. ✅ `message.channels` event is subscribed (for public channels) +2. ✅ `message.groups` event is subscribed (for private channels) +3. ✅ `app_mention` event is subscribed +4. ✅ `channels:history` scope is added (for public channels) +5. ✅ `groups:history` scope is added (for private channels) +6. ✅ App was **reinstalled** after adding scopes/events +7. ✅ Bot was **invited** to the channel (`/invite @Hermes Agent`) +8. ✅ You are **@mentioning** the bot in your message --- From 64bec1d06040a503202a05538afbdb6cc8713be8 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 14:31:19 -0700 Subject: [PATCH 06/14] fix: Slack gateway setup missing event subscriptions and scopes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'hermes gateway setup' instructions for Slack were missing: - The 'Subscribe to Events' step entirely (message.im, message.channels, app_mention, message.groups) - Several required scopes (app_mentions:read, groups:history, users:read, files:write) - Warning about bot only working in DMs without message.channels - Step to invite the bot to channels The 'hermes setup' flow (setup.py) and the website docs (slack.md) already had the correct information — only gateway.py was outdated. Reported by JordanB on Slack. --- hermes_cli/gateway.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 64fe551bef..3d146546da 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -482,14 +482,19 @@ _PLATFORMS = [ "token_var": "SLACK_BOT_TOKEN", "setup_instructions": [ "1. Go to https://api.slack.com/apps → Create New App → From Scratch", - "2. Enable Socket Mode: App Settings → Socket Mode → Enable", - "3. 
Get Bot Token: OAuth & Permissions → Install to Workspace → copy xoxb-... token", - "4. Get App Token: Basic Information → App-Level Tokens → Generate", - " Name it anything, add scope: connections:write → copy xapp-... token", - "5. Add bot scopes: OAuth & Permissions → Scopes → chat:write, im:history,", - " im:read, im:write, channels:history, channels:read", - "6. Reinstall the app to your workspace after adding scopes", + "2. Enable Socket Mode: Settings → Socket Mode → Enable", + " Create an App-Level Token with scope: connections:write → copy xapp-... token", + "3. Add Bot Token Scopes: Features → OAuth & Permissions → Scopes", + " Required: chat:write, app_mentions:read, channels:history, channels:read,", + " groups:history, im:history, im:read, im:write, users:read, files:write", + "4. Subscribe to Events: Features → Event Subscriptions → Enable", + " Required events: message.im, message.channels, app_mention", + " Optional: message.groups (for private channels)", + " ⚠ Without message.channels the bot will ONLY work in DMs!", + "5. Install to Workspace: Settings → Install App → copy xoxb-... token", + "6. Reinstall the app after any scope or event changes", "7. Find your user ID: click your profile → three dots → Copy member ID", + "8. Invite the bot to channels: /invite @YourBot", ], "vars": [ {"name": "SLACK_BOT_TOKEN", "prompt": "Bot Token (xoxb-...)", "password": True, From 520aec20e06c1d11ca443f1753c25ddfe1d3d993 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:12:54 -0700 Subject: [PATCH 07/14] fix: add mcp to dev dependencies for test suite MCP tests import from mcp.types but mcp wasn't in the dev optional dependencies. Fresh 'pip install -e .[dev]' setups failed 3 tests. Based on PR #427 by @teyrebaz33 (applied manually due to stale branch). --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f86cabd2f..01bdaf7e23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ [project.optional-dependencies] modal = ["swe-rex[modal]>=1.4.0"] daytona = ["daytona>=0.148.0"] -dev = ["pytest", "pytest-asyncio"] +dev = ["pytest", "pytest-asyncio", "mcp>=1.2.0"] messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cron = ["croniter"] slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] From fa2e72ae9c61a231445f28114b4f63f957e59dd1 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:29:34 -0700 Subject: [PATCH 08/14] docs: document docker_volumes config for shared host directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Docker backend already supports user-configured volume mounts via docker_volumes, but it was undocumented — missing from DEFAULT_CONFIG, cli.py defaults, and configuration docs. 
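To illustrate the env-var alternative documented below, a minimal sketch of how
the JSON form round-trips (variable name from the docs; the actual consumer is
the terminal backend, so treat the docker-flag expansion here as hypothetical):

```python
import json
import os

# Config list -> env var -> parsed mounts. json.dumps() emits double-quoted
# JSON that json.loads() on the consumer side can parse; a Python repr via
# str(list) would use single quotes and fail to parse.
volumes = ["/home/user/projects:/workspace/projects", "/data:/data:ro"]
os.environ["TERMINAL_DOCKER_VOLUMES"] = json.dumps(volumes)

mounts = json.loads(os.environ["TERMINAL_DOCKER_VOLUMES"])
docker_args = [arg for m in mounts for arg in ("-v", m)]
# -> ["-v", "/home/user/projects:/workspace/projects", "-v", "/data:/data:ro"]
```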
Changes: - hermes_cli/config.py: Add docker_volumes to DEFAULT_CONFIG with inline documentation and examples - cli.py: Add docker_volumes to load_cli_config defaults - configuration.md: Full Docker Volume Mounts section with YAML examples, use cases (providing files, receiving outputs, shared workspaces), and env var alternative --- cli.py | 1 + hermes_cli/config.py | 4 +++ website/docs/user-guide/configuration.md | 32 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/cli.py b/cli.py index 61cb8d966c..c82e85dc86 100755 --- a/cli.py +++ b/cli.py @@ -158,6 +158,7 @@ def load_cli_config() -> Dict[str, Any]: "singularity_image": "docker://python:3.11", "modal_image": "python:3.11", "daytona_image": "nikolaik/python-nodejs:python3.11-nodejs20", + "docker_volumes": [], # host:container volume mounts for Docker backend }, "browser": { "inactivity_timeout": 120, # Auto-cleanup inactive browser sessions after 2 min diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7b689d764c..018ac6557f 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -77,6 +77,10 @@ DEFAULT_CONFIG = { "container_memory": 5120, # MB (default 5GB) "container_disk": 51200, # MB (default 50GB) "container_persistent": True, # Persist filesystem across sessions + # Docker volume mounts — share host directories with the container. + # Each entry is "host_path:container_path" (standard Docker -v syntax). + # Example: ["/home/user/projects:/workspace/projects", "/data:/data"] + "docker_volumes": [], }, "browser": { diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index b600a47619..5e6f9088fb 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -393,8 +393,40 @@ terminal: backend: local # or: docker, ssh, singularity, modal, daytona cwd: "." # Working directory ("." = current dir) timeout: 180 # Command timeout in seconds + + # Docker-specific settings + docker_image: "nikolaik/python-nodejs:python3.11-nodejs20" + docker_volumes: # Share host directories with the container + - "/home/user/projects:/workspace/projects" + - "/home/user/data:/data:ro" # :ro for read-only + + # Container resource limits (docker, singularity, modal, daytona) + container_cpu: 1 # CPU cores + container_memory: 5120 # MB (default 5GB) + container_disk: 51200 # MB (default 50GB) + container_persistent: true # Persist filesystem across sessions ``` +### Docker Volume Mounts + +When using the Docker backend, `docker_volumes` lets you share host directories with the container. Each entry uses standard Docker `-v` syntax: `host_path:container_path[:options]`. + +```yaml +terminal: + backend: docker + docker_volumes: + - "/home/user/projects:/workspace/projects" # Read-write (default) + - "/home/user/datasets:/data:ro" # Read-only + - "/home/user/outputs:/outputs" # Agent writes, you read +``` + +This is useful for: +- **Providing files** to the agent (datasets, configs, reference code) +- **Receiving files** from the agent (generated code, reports, exports) +- **Shared workspaces** where both you and the agent access the same files + +Can also be set via environment variable: `TERMINAL_DOCKER_VOLUMES='["/host:/container"]'` (JSON array). + See [Code Execution](features/code-execution.md) and the [Terminal section of the README](features/tools.md) for details on each backend. 
## Memory Configuration From 2d44ed1c5b862ab0b674b576505b643a66fb225e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:32:02 -0700 Subject: [PATCH 09/14] test: add comprehensive tests for vision_tools (42 tests) Covers PR #428 changes and existing vision_tools functionality: - _validate_image_url: 20 tests for urlparse-based validation - _determine_mime_type: 6 tests for MIME type detection - _image_to_base64_data_url: 3 tests for base64 conversion - _handle_vision_analyze: 5 tests for type hints, prompt building, AUXILIARY_VISION_MODEL env var override - Error logging exc_info: 3 async tests verifying stack traces are logged on download failure, analysis error, and cleanup error - check_vision_requirements & get_debug_session_info: 2 basic tests - Registry integration: 3 tests for tool registration --- tests/tools/test_vision_tools.py | 351 +++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 tests/tools/test_vision_tools.py diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py new file mode 100644 index 0000000000..3bdd301786 --- /dev/null +++ b/tests/tools/test_vision_tools.py @@ -0,0 +1,351 @@ +"""Tests for tools/vision_tools.py — URL validation, type hints, error logging.""" + +import asyncio +import json +import logging +import os +from pathlib import Path +from typing import Awaitable +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from tools.vision_tools import ( + _validate_image_url, + _handle_vision_analyze, + _determine_mime_type, + _image_to_base64_data_url, + vision_analyze_tool, + check_vision_requirements, + get_debug_session_info, +) + + +# --------------------------------------------------------------------------- +# _validate_image_url — urlparse-based validation +# --------------------------------------------------------------------------- + +class TestValidateImageUrl: + """Tests for URL validation, including urlparse-based netloc check.""" + + def test_valid_https_url(self): + assert _validate_image_url("https://example.com/image.jpg") is True + + def test_valid_http_url(self): + assert _validate_image_url("http://cdn.example.org/photo.png") is True + + def test_valid_url_without_extension(self): + """CDN endpoints that redirect to images should still pass.""" + assert _validate_image_url("https://cdn.example.com/abcdef123") is True + + def test_valid_url_with_query_params(self): + assert _validate_image_url("https://img.example.com/pic?w=200&h=200") is True + + def test_valid_url_with_port(self): + assert _validate_image_url("http://localhost:8080/image.png") is True + + def test_valid_url_with_path_only(self): + assert _validate_image_url("https://example.com/") is True + + def test_rejects_empty_string(self): + assert _validate_image_url("") is False + + def test_rejects_none(self): + assert _validate_image_url(None) is False + + def test_rejects_non_string(self): + assert _validate_image_url(12345) is False + + def test_rejects_ftp_scheme(self): + assert _validate_image_url("ftp://files.example.com/image.jpg") is False + + def test_rejects_file_scheme(self): + assert _validate_image_url("file:///etc/passwd") is False + + def test_rejects_no_scheme(self): + assert _validate_image_url("example.com/image.jpg") is False + + def test_rejects_javascript_scheme(self): + assert _validate_image_url("javascript:alert(1)") is False + + def test_rejects_http_without_netloc(self): + """http:// alone has no network location — urlparse catches this.""" + assert 
_validate_image_url("http://") is False + + def test_rejects_https_without_netloc(self): + assert _validate_image_url("https://") is False + + def test_rejects_http_colon_only(self): + assert _validate_image_url("http:") is False + + def test_rejects_data_url(self): + assert _validate_image_url("data:image/png;base64,iVBOR") is False + + def test_rejects_whitespace_only(self): + assert _validate_image_url(" ") is False + + def test_rejects_boolean(self): + assert _validate_image_url(True) is False + + def test_rejects_list(self): + assert _validate_image_url(["https://example.com"]) is False + + +# --------------------------------------------------------------------------- +# _determine_mime_type +# --------------------------------------------------------------------------- + +class TestDetermineMimeType: + def test_jpg(self): + assert _determine_mime_type(Path("photo.jpg")) == "image/jpeg" + + def test_jpeg(self): + assert _determine_mime_type(Path("photo.jpeg")) == "image/jpeg" + + def test_png(self): + assert _determine_mime_type(Path("screenshot.png")) == "image/png" + + def test_gif(self): + assert _determine_mime_type(Path("anim.gif")) == "image/gif" + + def test_webp(self): + assert _determine_mime_type(Path("modern.webp")) == "image/webp" + + def test_unknown_extension_defaults_to_jpeg(self): + assert _determine_mime_type(Path("file.xyz")) == "image/jpeg" + + +# --------------------------------------------------------------------------- +# _image_to_base64_data_url +# --------------------------------------------------------------------------- + +class TestImageToBase64DataUrl: + def test_returns_data_url(self, tmp_path): + img = tmp_path / "test.png" + img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 8) + result = _image_to_base64_data_url(img) + assert result.startswith("data:image/png;base64,") + + def test_custom_mime_type(self, tmp_path): + img = tmp_path / "test.bin" + img.write_bytes(b"\x00" * 16) + result = _image_to_base64_data_url(img, mime_type="image/webp") + assert result.startswith("data:image/webp;base64,") + + def test_file_not_found_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + _image_to_base64_data_url(tmp_path / "nonexistent.png") + + +# --------------------------------------------------------------------------- +# _handle_vision_analyze — type signature & behavior +# --------------------------------------------------------------------------- + +class TestHandleVisionAnalyze: + """Verify _handle_vision_analyze returns an Awaitable and builds correct prompt.""" + + def test_returns_awaitable(self): + """The handler must return an Awaitable (coroutine) since it's registered as async.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + result = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "What is this?"} + ) + # It should be an Awaitable (coroutine) + assert isinstance(result, Awaitable) + # Clean up the coroutine to avoid RuntimeWarning + result.close() + + def test_prompt_contains_question(self): + """The full prompt should incorporate the user's question.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "Describe the cat"} + ) + # Clean up coroutine + coro.close() + call_args = mock_tool.call_args + full_prompt = 
call_args[0][1] # second positional arg + assert "Describe the cat" in full_prompt + assert "Fully describe and explain" in full_prompt + + def test_uses_auxiliary_vision_model_env(self): + """AUXILIARY_VISION_MODEL env var should override DEFAULT_VISION_MODEL.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \ + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "custom/model-v1"}): + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] # third positional arg + assert model == "custom/model-v1" + + def test_falls_back_to_default_model(self): + """Without AUXILIARY_VISION_MODEL, should use DEFAULT_VISION_MODEL or fallback.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \ + patch.dict(os.environ, {}, clear=False): + # Ensure AUXILIARY_VISION_MODEL is not set + os.environ.pop("AUXILIARY_VISION_MODEL", None) + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] + # Should be DEFAULT_VISION_MODEL or the hardcoded fallback + assert model is not None + assert len(model) > 0 + + def test_empty_args_graceful(self): + """Missing keys should default to empty strings, not raise.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + result = _handle_vision_analyze({}) + assert isinstance(result, Awaitable) + result.close() + + +# --------------------------------------------------------------------------- +# Error logging with exc_info — verify tracebacks are logged +# --------------------------------------------------------------------------- + +class TestErrorLoggingExcInfo: + """Verify that exc_info=True is used in error/warning log calls.""" + + @pytest.mark.asyncio + async def test_download_failure_logs_exc_info(self, tmp_path, caplog): + """After max retries, the download error should include exc_info.""" + from tools.vision_tools import _download_image + + with patch("tools.vision_tools.httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.get = AsyncMock(side_effect=ConnectionError("network down")) + mock_client_cls.return_value = mock_client + + dest = tmp_path / "image.jpg" + with caplog.at_level(logging.ERROR, logger="tools.vision_tools"), \ + pytest.raises(ConnectionError): + await _download_image("https://example.com/img.jpg", dest, max_retries=1) + + # Should have logged with exc_info (traceback present) + error_records = [r for r in caplog.records if r.levelno >= logging.ERROR] + assert len(error_records) >= 1 + assert error_records[0].exc_info is not None + + @pytest.mark.asyncio + async def test_analysis_error_logs_exc_info(self, caplog): + """When vision_analyze_tool encounters an error, it should log with exc_info.""" + with patch("tools.vision_tools._validate_image_url", return_value=True), \ + patch("tools.vision_tools._download_image", new_callable=AsyncMock, + side_effect=Exception("download boom")), \ + caplog.at_level(logging.ERROR, logger="tools.vision_tools"): + + result = await 
vision_analyze_tool( + "https://example.com/img.jpg", "describe this", "test/model" + ) + result_data = json.loads(result) + # Error response uses "success": False, not an "error" key + assert result_data["success"] is False + + error_records = [r for r in caplog.records if r.levelno >= logging.ERROR] + assert any(r.exc_info is not None for r in error_records) + + @pytest.mark.asyncio + async def test_cleanup_error_logs_exc_info(self, tmp_path, caplog): + """Temp file cleanup failure should log warning with exc_info.""" + # Create a real temp file that will be "downloaded" + temp_dir = tmp_path / "temp_vision_images" + temp_dir.mkdir() + + async def fake_download(url, dest, max_retries=3): + """Simulate download by writing file to the expected destination.""" + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(b"\xff\xd8\xff" + b"\x00" * 16) + return dest + + with patch("tools.vision_tools._validate_image_url", return_value=True), \ + patch("tools.vision_tools._download_image", side_effect=fake_download), \ + patch("tools.vision_tools._image_to_base64_data_url", + return_value="data:image/jpeg;base64,abc"), \ + patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None), \ + patch("agent.auxiliary_client.auxiliary_max_tokens_param", return_value={"max_tokens": 2000}), \ + caplog.at_level(logging.WARNING, logger="tools.vision_tools"): + + # Mock the vision client + mock_client = AsyncMock() + mock_response = MagicMock() + mock_choice = MagicMock() + mock_choice.message.content = "A test image description" + mock_response.choices = [mock_choice] + mock_client.chat.completions.create = AsyncMock(return_value=mock_response) + + # Patch module-level _aux_async_client so the tool doesn't bail early + with patch("tools.vision_tools._aux_async_client", mock_client), \ + patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"): + + # Make unlink fail to trigger cleanup warning + original_unlink = Path.unlink + def failing_unlink(self, *args, **kwargs): + raise PermissionError("no permission") + + with patch.object(Path, "unlink", failing_unlink): + result = await vision_analyze_tool( + "https://example.com/tempimg.jpg", "describe", "test/model" + ) + + warning_records = [r for r in caplog.records if r.levelno == logging.WARNING + and "temporary file" in r.getMessage().lower()] + assert len(warning_records) >= 1 + assert warning_records[0].exc_info is not None + + +# --------------------------------------------------------------------------- +# check_vision_requirements & get_debug_session_info +# --------------------------------------------------------------------------- + +class TestVisionRequirements: + def test_check_requirements_returns_bool(self): + result = check_vision_requirements() + assert isinstance(result, bool) + + def test_debug_session_info_returns_dict(self): + info = get_debug_session_info() + assert isinstance(info, dict) + # DebugSession.get_session_info() returns these keys + assert "enabled" in info + assert "session_id" in info + assert "total_calls" in info + + +# --------------------------------------------------------------------------- +# Integration: registry entry +# --------------------------------------------------------------------------- + +class TestVisionRegistration: + def test_vision_analyze_registered(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + assert entry is not None + assert entry.toolset == "vision" + assert entry.is_async is True + + def 
test_schema_has_required_fields(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + schema = entry.schema + assert schema["name"] == "vision_analyze" + params = schema.get("parameters", {}) + props = params.get("properties", {}) + assert "image_url" in props + assert "question" in props + + def test_handler_is_callable(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + assert callable(entry.handler) From ef5d811abac69725208a90062f2da6ac502ef3ea Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:36:19 -0700 Subject: [PATCH 10/14] fix: vision auto-detection now falls back to custom/local endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vision auto-mode previously only tried OpenRouter, Nous, and Codex for multimodal — deliberately skipping custom endpoints with the assumption they 'may not handle vision input.' This caused silent failures for users running local multimodal models (Qwen-VL, LLaVA, Pixtral, etc.) without any cloud API keys. Now custom endpoints are tried as a last resort in auto mode. If the model doesn't support vision, the API call fails gracefully — but users with local vision models no longer need to manually set auxiliary.vision.provider: main in config.yaml. Reported by @Spadav and @kotyKD. --- agent/auxiliary_client.py | 10 +++++++--- tests/agent/test_auxiliary_client.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index a32e3a2937..57c3c11869 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -560,12 +560,16 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: forced = _get_auxiliary_provider("vision") if forced != "auto": return _resolve_forced_provider(forced) - # Auto: only multimodal-capable providers - for try_fn in (_try_openrouter, _try_nous, _try_codex): + # Auto: try providers known to support multimodal first, then fall + # back to the user's custom endpoint. Many local models (Qwen-VL, + # LLaVA, Pixtral, etc.) support vision — skipping them entirely + # caused silent failures for local-only users. + for try_fn in (_try_openrouter, _try_nous, _try_codex, + _try_custom_endpoint): client, model = try_fn() if client is not None: return client, model - logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous/Codex)") + logger.debug("Auxiliary vision client: none available") return None, None diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 66187d0554..299d083f20 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -176,14 +176,18 @@ class TestVisionClientFallback: assert isinstance(client, CodexAuxiliaryClient) assert model == "gpt-5.3-codex" - def test_vision_auto_skips_custom_endpoint(self, monkeypatch): - """Custom endpoint is skipped in vision auto mode.""" + def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch): + """Custom endpoint is used as fallback in vision auto mode. + + Many local models (Qwen-VL, LLaVA, etc.) support vision. + When no OpenRouter/Nous/Codex is available, try the custom endpoint. 
+ """ monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:1234/v1") monkeypatch.setenv("OPENAI_API_KEY", "local-key") - with patch("agent.auxiliary_client._read_nous_auth", return_value=None): + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI") as mock_openai: client, model = get_vision_auxiliary_client() - assert client is None - assert model is None + assert client is not None # Custom endpoint picked up as fallback def test_vision_uses_openrouter_when_available(self, monkeypatch): monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") From 9abd6bf342aa9e05339df53826b11610d102b39a Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:24:00 -0700 Subject: [PATCH 11/14] fix: gateway missing docker_volumes config bridge + list serialization bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway's config.yaml → env var bridge was missing docker_volumes, so Docker volume mounts configured in config.yaml were ignored for gateway sessions (Telegram, Discord, etc.) while working in CLI. Also fixes list serialization: str() produces Python repr with single quotes which json.loads() in terminal_tool.py can't parse. Now uses json.dumps() for list values. Based on PR #431 by @manuelschipper (applied manually due to stale branch). --- gateway/run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 2584521d12..6dd1a280a5 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -75,11 +75,16 @@ if _config_path.exists(): "container_memory": "TERMINAL_CONTAINER_MEMORY", "container_disk": "TERMINAL_CONTAINER_DISK", "container_persistent": "TERMINAL_CONTAINER_PERSISTENT", + "docker_volumes": "TERMINAL_DOCKER_VOLUMES", "sandbox_dir": "TERMINAL_SANDBOX_DIR", } for _cfg_key, _env_var in _terminal_env_map.items(): if _cfg_key in _terminal_cfg: - os.environ[_env_var] = str(_terminal_cfg[_cfg_key]) + _val = _terminal_cfg[_cfg_key] + if isinstance(_val, list): + os.environ[_env_var] = json.dumps(_val) + else: + os.environ[_env_var] = str(_val) _compression_cfg = _cfg.get("compression", {}) if _compression_cfg and isinstance(_compression_cfg, dict): _compression_env_map = { From 5212644861ffefe2a51b259692da564cf0d4aab7 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:33:19 -0700 Subject: [PATCH 12/14] fix(security): prevent shell injection in tilde-username path expansion Validate that the username portion of ~username paths contains only valid characters (alphanumeric, dot, hyphen, underscore) before passing to shell echo for expansion. Previously, paths like '~; rm -rf /' would be passed unquoted to self._exec(f'echo {path}'), allowing arbitrary command execution. The approach validates the username rather than using shlex.quote(), which would prevent tilde expansion from working at all since echo '~user' outputs the literal string instead of expanding it. Added tests for injection blocking and valid ~username/path expansion. Credit to @alireza78a for reporting (PR #442, issue #442). 
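A standalone sketch of the validation with example inputs (illustrative only;
the shipped check lives in _expand_path, and the helper name here is
hypothetical):

```python
import re

def tilde_username_is_safe(path: str) -> bool:
    # Mirror of the patched logic: pull out the ~username portion and
    # whitelist its characters before any shell ever sees the path.
    rest = path[1:]                      # strip the leading "~"
    slash_idx = rest.find("/")
    username = rest[:slash_idx] if slash_idx >= 0 else rest
    return bool(username) and re.fullmatch(r"[a-zA-Z0-9._-]+", username) is not None

assert tilde_username_is_safe("~root/file.txt")   # valid user: expansion may proceed
assert not tilde_username_is_safe("~; rm -rf /")  # ";" fails the whitelist: no shell call
```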
--- tests/tools/test_file_tools_live.py | 19 +++++++++++++++++++ tools/file_operations.py | 14 ++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/tools/test_file_tools_live.py b/tests/tools/test_file_tools_live.py index 426b3543bb..72efbb2375 100644 --- a/tests/tools/test_file_tools_live.py +++ b/tests/tools/test_file_tools_live.py @@ -505,6 +505,25 @@ class TestExpandPath: assert result == str(Path.home()) _assert_clean(result) + def test_tilde_injection_blocked(self, ops): + """Paths like ~; rm -rf / must NOT execute shell commands.""" + malicious = "~; echo PWNED > /tmp/_hermes_injection_test" + result = ops._expand_path(malicious) + # The invalid username (contains ";") should prevent shell expansion. + # The path should be returned as-is (no expansion). + assert result == malicious + # Verify the injected command did NOT execute + import os + assert not os.path.exists("/tmp/_hermes_injection_test") + + def test_tilde_username_with_subpath(self, ops): + """~root/file.txt should attempt expansion (valid username).""" + result = ops._expand_path("~root/file.txt") + # On most systems ~root expands to /root + if result != "~root/file.txt": + assert result.endswith("/file.txt") + assert "~" not in result + # ── Terminal output cleanliness ────────────────────────────────────────── diff --git a/tools/file_operations.py b/tools/file_operations.py index 3f72c5fdb1..b3b8f15309 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -400,10 +400,16 @@ class ShellFileOperations(FileOperations): return home elif path.startswith('~/'): return home + path[1:] # Replace ~ with home - # ~username format - let shell expand it - expand_result = self._exec(f"echo {path}") - if expand_result.exit_code == 0: - return expand_result.stdout.strip() + # ~username format - extract and validate username before + # letting shell expand it (prevent shell injection via + # paths like "~; rm -rf /"). + rest = path[1:] # strip leading ~ + slash_idx = rest.find('/') + username = rest[:slash_idx] if slash_idx >= 0 else rest + if username and re.fullmatch(r'[a-zA-Z0-9._-]+', username): + expand_result = self._exec(f"echo {path}") + if expand_result.exit_code == 0 and expand_result.stdout.strip(): + return expand_result.stdout.strip() return path From 8eabdefa8ac26b2ae799882c37bea91a50296d6e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:45:50 -0700 Subject: [PATCH 13/14] fix: bring WebResearchEnv up to Atropos environment standards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The environment was merged missing several standard components. Updated to match the patterns established by 82 Atropos environments and our own HermesAgentBaseEnv contract. Added: - WebResearchEnvConfig — custom Pydantic config with reward weights, efficiency thresholds, eval settings, dataset config (all tunable via CLI/YAML without code changes) - config_init() classmethod — default server config (OpenRouter + Claude) so the env works out of the box - wandb_log() override — logs reward breakdown metrics (correctness, tool_usage, efficiency, diversity, correct_rate, tool_usage_rate) with proper buffer management and super() call - evaluate() — uses server.chat_completion instead of broken stub _run_agent_on_item(). Logs via evaluate_log() for lighteval- compatible output. 
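As a worked example of the default reward mix (weights from
WebResearchEnvConfig; the signal values below are hypothetical):

```python
# One rollout: judge score 0.8, web tools used, 4 tool calls (<= 5, so
# full efficiency credit), and at least 2 distinct domains cited.
correctness, tool_used, efficiency, diversity_bonus = 0.8, 1.0, 1.0, 0.1
reward = 0.6 * correctness + 0.2 * tool_used + 0.2 * efficiency + diversity_bonus
reward = min(1.0, max(0.0, reward))  # clamp to [0, 1] -> 0.98
```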
Fixed: - Removed broken _run_agent_on_item() stub that returned empty results - evaluate() now uses server.chat_completion (same pattern as TerminalTestEnv) for actual model evaluation - compute_reward reads tool calls from AgentResult properly - LLM judge uses self.server.chat_completion instead of ctx Reward config is now tunable without code changes: --env.correctness_weight 0.6 --env.tool_usage_weight 0.2 --env.efficiency_weight 0.2 --env.diversity_bonus 0.1 --env.efficient_max_calls 5 --- environments/web_research_env.py | 414 ++++++++++++++++++++----------- 1 file changed, 270 insertions(+), 144 deletions(-) diff --git a/environments/web_research_env.py b/environments/web_research_env.py index e73eb45c6d..a868cd034e 100644 --- a/environments/web_research_env.py +++ b/environments/web_research_env.py @@ -16,21 +16,18 @@ Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions Usage: # Phase 1 (OpenAI-compatible server) - python environments/web_research_env.py serve \ - --openai.base_url http://localhost:8000/v1 \ - --openai.model_name YourModel \ + python environments/web_research_env.py serve \\ + --openai.base_url http://localhost:8000/v1 \\ + --openai.model_name YourModel \\ --openai.server_type openai - # With eval split - python environments/web_research_env.py serve \ - --openai.base_url http://localhost:8000/v1 \ - --openai.model_name YourModel \ - --env.eval_every 50 \ - --env.eval_size 20 + # Process mode (offline data generation) + python environments/web_research_env.py process \\ + --env.data_path_to_save_groups data/web_research.jsonl - # Standalone eval (no training server needed) - python environments/web_research_env.py eval \ - --openai.base_url http://localhost:8000/v1 \ + # Standalone eval + python environments/web_research_env.py evaluate \\ + --openai.base_url http://localhost:8000/v1 \\ --openai.model_name YourModel Built by: github.com/jackx707 @@ -43,11 +40,21 @@ from __future__ import annotations import asyncio import json import logging +import os import random import re -from typing import Any, Optional +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse +from pydantic import Field + +# Ensure hermes-agent root is on path +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + # --------------------------------------------------------------------------- # Optional HuggingFace datasets import # --------------------------------------------------------------------------- @@ -57,13 +64,19 @@ try: except ImportError: HF_AVAILABLE = False -from environments.hermes_base_env import HermesAgentBaseEnv +from atroposlib.envs.base import ScoredDataGroup +from atroposlib.envs.server_handling.server_manager import APIServerConfig +from atroposlib.type_definitions import Item + +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.agent_loop import AgentResult +from environments.tool_context import ToolContext logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Fallback sample dataset (used when HuggingFace is unavailable) -# These are multi-hop questions that require real web search to answer. +# Multi-hop questions requiring real web search to answer. 
# --------------------------------------------------------------------------- SAMPLE_QUESTIONS = [ { @@ -129,6 +142,58 @@ SAMPLE_QUESTIONS = [ ] +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +class WebResearchEnvConfig(HermesAgentEnvConfig): + """Configuration for the web research RL environment.""" + + # Reward weights + correctness_weight: float = Field( + default=0.6, + description="Weight for answer correctness in reward (LLM judge score).", + ) + tool_usage_weight: float = Field( + default=0.2, + description="Weight for tool usage signal (did the model actually use web tools?).", + ) + efficiency_weight: float = Field( + default=0.2, + description="Weight for efficiency signal (penalizes excessive tool calls).", + ) + diversity_bonus: float = Field( + default=0.1, + description="Bonus reward for citing ≥2 distinct domains.", + ) + + # Efficiency thresholds + efficient_max_calls: int = Field( + default=5, + description="Maximum tool calls before efficiency penalty begins.", + ) + heavy_penalty_calls: int = Field( + default=10, + description="Tool call count where efficiency penalty steepens.", + ) + + # Eval + eval_size: int = Field( + default=20, + description="Number of held-out items for evaluation.", + ) + eval_split_ratio: float = Field( + default=0.1, + description="Fraction of dataset to hold out for evaluation (0.0–1.0).", + ) + + # Dataset + dataset_name: str = Field( + default="google/frames-benchmark", + description="HuggingFace dataset name for research questions.", + ) + + # --------------------------------------------------------------------------- # Environment # --------------------------------------------------------------------------- @@ -143,23 +208,60 @@ class WebResearchEnv(HermesAgentBaseEnv): Reward is multi-signal: 60% — answer correctness (LLM judge) 20% — tool usage (did the model actually search the web?) - 20% — efficiency (penalizes >6 tool calls) + 20% — efficiency (penalizes >5 tool calls) Bonus +0.1 for source diversity (≥2 distinct domains cited). """ name = "web-research" + env_config_cls = WebResearchEnvConfig # Default toolsets for this environment — web + file for saving notes default_toolsets = ["web", "file"] + @classmethod + def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]: + """Default configuration for the web research environment.""" + env_config = WebResearchEnvConfig( + enabled_toolsets=["web", "file"], + max_agent_turns=15, + agent_temperature=1.0, + system_prompt=( + "You are a highly capable research agent. When asked a factual question, " + "always use web_search to find current, accurate information before answering. " + "Cite at least 2 sources. Be concise and accurate." 
+ ), + group_size=4, + total_steps=1000, + steps_per_eval=100, + use_wandb=True, + wandb_name="web-research", + ) + + server_configs = [ + APIServerConfig( + base_url="https://openrouter.ai/api/v1", + model_name="anthropic/claude-sonnet-4.5", + server_type="openai", + api_key=os.getenv("OPENROUTER_API_KEY", ""), + health_check=False, + ) + ] + + return env_config, server_configs + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._items: list[dict] = [] self._eval_items: list[dict] = [] self._index: int = 0 - self._total_scored: int = 0 - self._total_reward: float = 0.0 + + # Metrics tracking for wandb + self._reward_buffer: list[float] = [] + self._correctness_buffer: list[float] = [] + self._tool_usage_buffer: list[float] = [] + self._efficiency_buffer: list[float] = [] + self._diversity_buffer: list[float] = [] # ------------------------------------------------------------------ # 1. Setup — load dataset @@ -170,7 +272,7 @@ class WebResearchEnv(HermesAgentBaseEnv): if HF_AVAILABLE: try: logger.info("Loading FRAMES benchmark from HuggingFace...") - ds = load_dataset("google/frames-benchmark", split="test") + ds = load_dataset(self.config.dataset_name, split="test") self._items = [ { "question": row["Prompt"], @@ -180,8 +282,11 @@ class WebResearchEnv(HermesAgentBaseEnv): } for row in ds ] - # Hold out 10% for eval - eval_size = max(20, len(self._items) // 10) + # Hold out for eval + eval_size = max( + self.config.eval_size, + int(len(self._items) * self.config.eval_split_ratio), + ) random.shuffle(self._items) self._eval_items = self._items[:eval_size] self._items = self._items[eval_size:] @@ -220,10 +325,7 @@ class WebResearchEnv(HermesAgentBaseEnv): # ------------------------------------------------------------------ def format_prompt(self, item: dict) -> str: - """ - Format the research question as a task prompt. - Instructs the model to use web search and cite sources. - """ + """Format the research question as a task prompt.""" return ( f"Research the following question thoroughly using web search. " f"You MUST search the web to find current, accurate information — " @@ -243,27 +345,30 @@ class WebResearchEnv(HermesAgentBaseEnv): async def compute_reward( self, item: dict, - result: dict, - ctx: Any, # ToolContext + result: AgentResult, + ctx: ToolContext, ) -> float: """ Multi-signal reward function: - 0.6 * correctness — LLM judge comparing answer to ground truth - 0.2 * tool_used — binary: did the model use web tools? - 0.2 * efficiency — penalizes wasteful tool usage - +0.1 bonus — source diversity (≥2 distinct domains) + correctness_weight * correctness — LLM judge comparing answer to ground truth + tool_usage_weight * tool_used — binary: did the model use web tools? 
+ efficiency_weight * efficiency — penalizes wasteful tool usage + + diversity_bonus — source diversity (≥2 distinct domains) """ - final_response: str = result.get("final_response", "") - tools_used: list[str] = result.get("tools_used", []) - tool_call_count: int = result.get("tool_call_count", len(tools_used)) + final_response: str = result.final_response or "" + tools_used: list[str] = [ + tc.tool_name for tc in (result.tool_calls or []) + ] if hasattr(result, "tool_calls") and result.tool_calls else [] + tool_call_count: int = result.turns_used or len(tools_used) + + cfg = self.config # ---- Signal 1: Answer correctness (LLM judge) ---------------- correctness = await self._llm_judge( question=item["question"], expected=item["answer"], model_answer=final_response, - ctx=ctx, ) # ---- Signal 2: Web tool usage -------------------------------- @@ -271,35 +376,37 @@ class WebResearchEnv(HermesAgentBaseEnv): tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0 # ---- Signal 3: Efficiency ------------------------------------ - # Ideal: 2-5 tool calls. Penalise beyond 6, hard cap at 15. - if tool_call_count <= 5: + if tool_call_count <= cfg.efficient_max_calls: efficiency = 1.0 - elif tool_call_count <= 10: - efficiency = 1.0 - (tool_call_count - 5) * 0.08 + elif tool_call_count <= cfg.heavy_penalty_calls: + efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08 else: - efficiency = max(0.0, 1.0 - (tool_call_count - 5) * 0.12) + efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12) # ---- Bonus: Source diversity --------------------------------- domains = self._extract_domains(final_response) - diversity_bonus = 0.1 if len(domains) >= 2 else 0.0 + diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0 # ---- Combine ------------------------------------------------ reward = ( - 0.6 * correctness - + 0.2 * tool_used - + 0.2 * efficiency - + diversity_bonus + cfg.correctness_weight * correctness + + cfg.tool_usage_weight * tool_used + + cfg.efficiency_weight * efficiency + + diversity ) reward = min(1.0, max(0.0, reward)) # clamp to [0, 1] - # Track running stats - self._total_scored += 1 - self._total_reward += reward + # Track for wandb + self._reward_buffer.append(reward) + self._correctness_buffer.append(correctness) + self._tool_usage_buffer.append(tool_used) + self._efficiency_buffer.append(efficiency) + self._diversity_buffer.append(diversity) logger.debug( f"Reward breakdown — correctness={correctness:.2f}, " f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, " - f"diversity_bonus={diversity_bonus:.1f} → total={reward:.3f}" + f"diversity={diversity:.1f} → total={reward:.3f}" ) return reward @@ -308,68 +415,117 @@ class WebResearchEnv(HermesAgentBaseEnv): # 5. evaluate — run on held-out eval split # ------------------------------------------------------------------ - async def evaluate( - self, - *args: Any, - eval_size: Optional[int] = None, - **kwargs: Any, - ) -> dict: - """ - Run evaluation on the held-out split. - Returns a dict of metrics for logging. 
- """ - items = self._eval_items - if eval_size: - items = items[:eval_size] + async def evaluate(self, *args, **kwargs) -> None: + """Run evaluation on the held-out split using the agent loop.""" + import time + items = self._eval_items if not items: logger.warning("No eval items available.") - return {} + return - logger.info(f"Running eval on {len(items)} questions...") + eval_size = min(self.config.eval_size, len(items)) + eval_items = items[:eval_size] - rewards = [] - correctness_scores = [] + logger.info(f"Running eval on {len(eval_items)} questions...") + start_time = time.time() + samples = [] - for item in items: + for item in eval_items: try: - # Run the agent on each eval question - result = await self._run_agent_on_item(item) - reward = await self.compute_reward(item, result, ctx=None) - rewards.append(reward) + # Use the base env's agent loop for eval (same as training) + prompt = self.format_prompt(item) + completion = await self.server.chat_completion( + messages=[ + {"role": "system", "content": self.config.system_prompt or ""}, + {"role": "user", "content": prompt}, + ], + n=1, + max_tokens=self.config.max_token_length, + temperature=0.0, + split="eval", + ) + + response_content = ( + completion.choices[0].message.content if completion.choices else "" + ) + + # Score the response + correctness = await self._llm_judge( + question=item["question"], + expected=item["answer"], + model_answer=response_content, + ) + + samples.append({ + "prompt": item["question"], + "response": response_content, + "expected": item["answer"], + "correctness": correctness, + }) - # Also track raw correctness separately - if result.get("final_response"): - correctness_scores.append( - await self._llm_judge( - question=item["question"], - expected=item["answer"], - model_answer=result["final_response"], - ctx=None, - ) - ) except Exception as e: logger.error(f"Eval error on item: {e}") - rewards.append(0.0) + samples.append({ + "prompt": item["question"], + "response": f"ERROR: {e}", + "expected": item["answer"], + "correctness": 0.0, + }) - metrics = { - "eval/mean_reward": sum(rewards) / len(rewards) if rewards else 0.0, + end_time = time.time() + + # Compute metrics + correctness_scores = [s["correctness"] for s in samples] + eval_metrics = { "eval/mean_correctness": ( sum(correctness_scores) / len(correctness_scores) if correctness_scores else 0.0 ), - "eval/n_items": len(rewards), - "train/mean_reward_so_far": ( - self._total_reward / self._total_scored - if self._total_scored > 0 else 0.0 - ), + "eval/n_items": len(samples), } - logger.info( - f"Eval complete — mean_reward={metrics['eval/mean_reward']:.3f}, " - f"mean_correctness={metrics['eval/mean_correctness']:.3f}" + await self.evaluate_log( + metrics=eval_metrics, + samples=samples, + start_time=start_time, + end_time=end_time, ) - return metrics + + # ------------------------------------------------------------------ + # 6. 
wandb_log — custom metrics
+    # ------------------------------------------------------------------
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
+        """Log reward breakdown metrics to wandb."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        if self._reward_buffer:
+            n = len(self._reward_buffer)
+            wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
+            wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
+            wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
+            wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
+            wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
+            wandb_metrics["train/total_rollouts"] = n
+
+            # Rate metrics (fraction of rollouts meeting thresholds)
+            wandb_metrics["train/correct_rate"] = (
+                sum(1 for c in self._correctness_buffer if c >= 0.7) / n
+            )
+            wandb_metrics["train/tool_usage_rate"] = (
+                sum(1 for t in self._tool_usage_buffer if t > 0) / n
+            )
+
+            # Clear buffers
+            self._reward_buffer.clear()
+            self._correctness_buffer.clear()
+            self._tool_usage_buffer.clear()
+            self._efficiency_buffer.clear()
+            self._diversity_buffer.clear()
+
+        await super().wandb_log(wandb_metrics)

     # ------------------------------------------------------------------
     # Private helpers
@@ -380,19 +536,14 @@ class WebResearchEnv(HermesAgentBaseEnv):
         question: str,
         expected: str,
         model_answer: str,
-        ctx: Any,
     ) -> float:
         """
-        Use an LLM to judge whether `model_answer` correctly addresses
-        `question` compared to `expected`. Returns a float in [0, 1].
-
-        Uses the agent's own inference client if ctx is available,
-        otherwise falls back to a lightweight heuristic.
+        Use the server's LLM to judge answer correctness.
+        Falls back to keyword heuristic if LLM call fails.
         """
         if not model_answer or not model_answer.strip():
             return 0.0

-        # Build judge prompt
         judge_prompt = (
             "You are an impartial judge evaluating the quality of an AI research answer.\n\n"
             f"Question: {question}\n\n"
@@ -405,39 +556,36 @@ class WebResearchEnv(HermesAgentBaseEnv):
             "  0.1 = mentions relevant topic but wrong or very incomplete\n"
             "  0.0 = completely wrong or no answer\n\n"
             "Consider: factual accuracy, completeness, and relevance.\n"
-            "Respond with ONLY a JSON object: {\"score\": <float>, \"reason\": \"<brief explanation>\"}"
+            'Respond with ONLY a JSON object: {"score": <float>, "reason": "<brief explanation>"}'
         )

-        # Try using ctx for inference (Phase 2 / live training)
-        if ctx is not None and hasattr(ctx, "chat_completion"):
-            try:
-                response = await ctx.chat_completion(
-                    messages=[{"role": "user", "content": judge_prompt}],
-                    max_tokens=100,
-                    temperature=0.0,
-                )
-                text = response.get("content", "")
-                parsed = self._parse_judge_json(text)
-                if parsed is not None:
-                    return float(parsed)
-            except Exception as e:
-                logger.debug(f"LLM judge via ctx failed: {e}. Using heuristic.")
+        try:
+            response = await self.server.chat_completion(
+                messages=[{"role": "user", "content": judge_prompt}],
+                n=1,
+                max_tokens=150,
+                temperature=0.0,
+                split="eval",
+            )
+            text = response.choices[0].message.content if response.choices else ""
+            parsed = self._parse_judge_json(text)
+            if parsed is not None:
+                return float(parsed)
+        except Exception as e:
+            logger.debug(f"LLM judge failed: {e}. 
Using heuristic.") - # Fallback: keyword overlap heuristic return self._heuristic_score(expected, model_answer) @staticmethod def _parse_judge_json(text: str) -> Optional[float]: """Extract the score float from LLM judge JSON response.""" try: - # Strip markdown code fences if present clean = re.sub(r"```(?:json)?|```", "", text).strip() data = json.loads(clean) score = float(data.get("score", -1)) if 0.0 <= score <= 1.0: return score except Exception: - # Try regex fallback match = re.search(r'"score"\s*:\s*([0-9.]+)', text) if match: score = float(match.group(1)) @@ -447,10 +595,7 @@ class WebResearchEnv(HermesAgentBaseEnv): @staticmethod def _heuristic_score(expected: str, model_answer: str) -> float: - """ - Lightweight keyword overlap score as fallback when no LLM is available. - Extracts meaningful tokens and computes Jaccard similarity. - """ + """Lightweight keyword overlap score as fallback.""" stopwords = { "the", "a", "an", "is", "are", "was", "were", "of", "in", "on", "at", "to", "for", "with", "and", "or", "but", "it", "its", @@ -458,35 +603,30 @@ class WebResearchEnv(HermesAgentBaseEnv): } def tokenize(text: str) -> set: - tokens = re.findall(r'\b[a-zA-Z0-9]+\b', text.lower()) + tokens = re.findall(r'\b\w+\b', text.lower()) return {t for t in tokens if t not in stopwords and len(t) > 2} expected_tokens = tokenize(expected) answer_tokens = tokenize(model_answer) if not expected_tokens: - return 0.5 # Can't judge + return 0.5 overlap = len(expected_tokens & answer_tokens) union = len(expected_tokens | answer_tokens) jaccard = overlap / union if union > 0 else 0.0 - # Recall-weighted: reward covering expected content recall = overlap / len(expected_tokens) return min(1.0, 0.4 * jaccard + 0.6 * recall) @staticmethod def _extract_domains(text: str) -> set: - """ - Extract unique domains from URLs cited in the response. - Used to measure source diversity. - """ + """Extract unique domains from URLs cited in the response.""" urls = re.findall(r'https?://[^\s\)>\]"\']+', text) domains = set() for url in urls: try: parsed = urlparse(url) - # Normalize: strip www. domain = parsed.netloc.lower().lstrip("www.") if domain: domains.add(domain) @@ -494,20 +634,6 @@ class WebResearchEnv(HermesAgentBaseEnv): pass return domains - async def _run_agent_on_item(self, item: dict) -> dict: - """ - Stub for running agent during eval. In Phase 1/2, this is handled - by the Atropos framework's rollout mechanism. Provided here for - standalone eval compatibility. - """ - # In real usage, the framework calls get_next_item + format_prompt - # and runs the agent. This stub returns an empty result for safety. - return { - "final_response": "", - "tools_used": [], - "tool_call_count": 0, - } - # --------------------------------------------------------------------------- # Entry point From 172a38c344a372296ea995258d2251be4245ba04 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:52:33 -0700 Subject: [PATCH 14/14] fix: Docker persistent bind mounts fail with Permission denied MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cap-drop ALL removes DAC_OVERRIDE, which root needs to write to bind-mounted directories owned by the host user (uid 1000). This broke persistent Docker sandboxes — the container couldn't write to /workspace or /root. 
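One way to see which capabilities a running container actually holds is to
decode its effective capability mask (a sketch; the bit numbers CHOWN=0,
DAC_OVERRIDE=1, FOWNER=3 come from linux/capability.h):

```python
# Run inside the container: read CapEff from /proc/self/status and test
# the three capabilities this fix re-adds.
with open("/proc/self/status") as f:
    cap_eff = next(int(line.split()[1], 16) for line in f if line.startswith("CapEff"))

for name, bit in (("CHOWN", 0), ("DAC_OVERRIDE", 1), ("FOWNER", 3)):
    print(f"{name}: {'present' if cap_eff >> bit & 1 else 'missing'}")
```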
Add back the minimum capabilities needed: - DAC_OVERRIDE: root can write to bind-mounted dirs owned by host user - CHOWN: package managers (pip, npm, apt) need to set file ownership - FOWNER: needed for operations on files owned by other users Still drops all other capabilities (NET_RAW, SYS_ADMIN, etc.) and keeps no-new-privileges. Security boundary is the container itself. Verified end-to-end: create files → destroy container → new container with same task_id → files persist on host and are accessible in the new container. --- tools/environments/docker.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/environments/docker.py b/tools/environments/docker.py index 85184fde7c..faf01b2a25 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -22,10 +22,16 @@ logger = logging.getLogger(__name__) # Security flags applied to every container. # The container itself is the security boundary (isolated from host). -# We drop all capabilities, block privilege escalation, and limit PIDs. +# We drop all capabilities then add back the minimum needed: +# DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user +# CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership +# Block privilege escalation and limit PIDs. # /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds). _SECURITY_ARGS = [ "--cap-drop", "ALL", + "--cap-add", "DAC_OVERRIDE", + "--cap-add", "CHOWN", + "--cap-add", "FOWNER", "--security-opt", "no-new-privileges", "--pids-limit", "256", "--tmpfs", "/tmp:rw,nosuid,size=512m",