From 7d79ce92ac22c85072981ef28e84abc82b2c679b Mon Sep 17 00:00:00 2001 From: aydnOktay Date: Thu, 5 Mar 2026 16:11:59 +0300 Subject: [PATCH 01/14] Improve type hints and error diagnostics in vision_tools --- tools/vision_tools.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 456f85583d..0b6d11194d 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -27,14 +27,15 @@ Usage: ) """ +import asyncio +import base64 import json import logging import os -import asyncio import uuid -import base64 from pathlib import Path -from typing import Dict, Any, Optional +from typing import Any, Awaitable, Dict, Optional +from urllib.parse import urlparse import httpx from openai import AsyncOpenAI from agent.auxiliary_client import get_vision_auxiliary_client @@ -73,15 +74,18 @@ def _validate_image_url(url: str) -> bool: """ if not url or not isinstance(url, str): return False - - # Check if it's a valid URL format - if not (url.startswith('http://') or url.startswith('https://')): + + # Basic HTTP/HTTPS URL check + if not (url.startswith("http://") or url.startswith("https://")): return False - - # Check for common image extensions (optional, as URLs may not have extensions) - image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'] - - return True # Allow all HTTP/HTTPS URLs for flexibility + + # Parse to ensure we at least have a network location; still allow URLs + # without file extensions (e.g. CDN endpoints that redirect to images). + parsed = urlparse(url) + if not parsed.netloc: + return False + + return True # Allow all well-formed HTTP/HTTPS URLs for flexibility async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path: @@ -131,7 +135,12 @@ async def _download_image(image_url: str, destination: Path, max_retries: int = logger.warning("Retrying in %ss...", wait_time) await asyncio.sleep(wait_time) else: - logger.error("Image download failed after %s attempts: %s", max_retries, str(e)[:100]) + logger.error( + "Image download failed after %s attempts: %s", + max_retries, + str(e)[:100], + exc_info=True, + ) raise last_error @@ -188,7 +197,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) async def vision_analyze_tool( image_url: str, user_prompt: str, - model: str = DEFAULT_VISION_MODEL + model: str = DEFAULT_VISION_MODEL, ) -> str: """ Analyze an image from a URL or local file path using vision AI. 
@@ -347,7 +356,7 @@ async def vision_analyze_tool( except Exception as e: error_msg = f"Error analyzing image: {str(e)}" - logger.error("%s", error_msg) + logger.error("%s", error_msg, exc_info=True) # Prepare error response result = { @@ -368,7 +377,9 @@ async def vision_analyze_tool( temp_image_path.unlink() logger.debug("Cleaned up temporary image file") except Exception as cleanup_error: - logger.warning("Could not delete temporary file: %s", cleanup_error) + logger.warning( + "Could not delete temporary file: %s", cleanup_error, exc_info=True + ) def check_vision_requirements() -> bool: @@ -464,10 +475,13 @@ VISION_ANALYZE_SCHEMA = { } -def _handle_vision_analyze(args, **kw): +def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: image_url = args.get("image_url", "") question = args.get("question", "") - full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}" + full_prompt = ( + "Fully describe and explain everything about this image, then answer the " + f"following question:\n\n{question}" + ) model = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview" return vision_analyze_tool(image_url, full_prompt, model) From 15561ec425a74f26bd2051f562d60ec43f78a050 Mon Sep 17 00:00:00 2001 From: jackx707 Date: Thu, 5 Mar 2026 14:34:36 +0000 Subject: [PATCH 02/14] feat: add WebResearchEnv RL environment for multi-step web research --- datagen-config-examples/web_research.yaml | 46 ++ environments/web_research_env.py | 517 ++++++++++++++++++++++ 2 files changed, 563 insertions(+) create mode 100644 datagen-config-examples/web_research.yaml create mode 100644 environments/web_research_env.py diff --git a/datagen-config-examples/web_research.yaml b/datagen-config-examples/web_research.yaml new file mode 100644 index 0000000000..6275dbed69 --- /dev/null +++ b/datagen-config-examples/web_research.yaml @@ -0,0 +1,46 @@ +# datagen-config-examples/web_research.yaml +# +# Batch data generation config for WebResearchEnv. +# Generates tool-calling trajectories for multi-step web research tasks. +# +# Usage: +# python batch_runner.py \ +# --config datagen-config-examples/web_research.yaml \ +# --run_name web_research_v1 + +environment: web-research + +# Toolsets available to the agent during data generation +toolsets: + - web + - file + +# How many parallel workers to use +num_workers: 4 + +# Questions per batch +batch_size: 20 + +# Total trajectories to generate (comment out to run full dataset) +max_items: 500 + +# Model to use for generation (override with --model flag) +model: openrouter/nousresearch/hermes-3-llama-3.1-405b + +# System prompt additions (ephemeral — not saved to trajectories) +ephemeral_system_prompt: | + You are a highly capable research agent. When asked a factual question, + always use web_search to find current, accurate information before answering. + Cite at least 2 sources. Be concise and accurate. 
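
# Example: overriding the generation model from the CLI. batch_runner.py,
# --config, and --run_name come from the usage block at the top of this file;
# --model is the override flag noted above. The model string is illustrative.
#   python batch_runner.py \
#     --config datagen-config-examples/web_research.yaml \
#     --run_name web_research_v1 \
#     --model openrouter/<provider>/<model>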
+ +# Output directory +output_dir: data/web_research_v1 + +# Trajectory compression settings (for fitting into training token budgets) +compression: + enabled: true + target_max_tokens: 16000 + +# Eval settings +eval_every: 100 # Run eval every N trajectories +eval_size: 25 # Number of held-out questions per eval run diff --git a/environments/web_research_env.py b/environments/web_research_env.py new file mode 100644 index 0000000000..e73eb45c6d --- /dev/null +++ b/environments/web_research_env.py @@ -0,0 +1,517 @@ +""" +WebResearchEnv — RL Environment for Multi-Step Web Research +============================================================ + +Trains models to do accurate, efficient, multi-source web research. + +Reward signals: + - Answer correctness (LLM judge, 0.0–1.0) + - Source diversity (used ≥2 distinct domains) + - Efficiency (penalizes excessive tool calls) + - Tool usage (bonus for actually using web tools) + +Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions + HuggingFace: google/frames-benchmark + Fallback: built-in sample questions (no HF token needed) + +Usage: + # Phase 1 (OpenAI-compatible server) + python environments/web_research_env.py serve \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel \ + --openai.server_type openai + + # With eval split + python environments/web_research_env.py serve \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel \ + --env.eval_every 50 \ + --env.eval_size 20 + + # Standalone eval (no training server needed) + python environments/web_research_env.py eval \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel + +Built by: github.com/jackx707 +Inspired by: GroceryMind — production Hermes agent doing live web research + across German grocery stores (firecrawl + hermes-agent) +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import random +import re +from typing import Any, Optional +from urllib.parse import urlparse + +# --------------------------------------------------------------------------- +# Optional HuggingFace datasets import +# --------------------------------------------------------------------------- +try: + from datasets import load_dataset + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +from environments.hermes_base_env import HermesAgentBaseEnv + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Fallback sample dataset (used when HuggingFace is unavailable) +# These are multi-hop questions that require real web search to answer. +# --------------------------------------------------------------------------- +SAMPLE_QUESTIONS = [ + { + "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?", + "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?", + "answer": "The Linux Foundation oversees Kubernetes. 
CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What programming language was used to write the original version of the web framework used by Instagram?", + "answer": "Django, which Instagram was built on, is written in Python.", + "difficulty": "easy", + "hops": 2, + }, + { + "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?", + "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).", + "difficulty": "hard", + "hops": 3, + }, + { + "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?", + "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "How many employees does the parent company of Instagram have?", + "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?", + "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.", + "difficulty": "hard", + "hops": 2, + }, + { + "question": "Which company acquired the startup founded by the creator of Oculus VR?", + "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.", + "difficulty": "medium", + "hops": 2, + }, + { + "question": "What is the market cap of the company that owns the most popular search engine in Russia?", + "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.", + "difficulty": "hard", + "hops": 2, + }, + { + "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?", + "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.", + "difficulty": "hard", + "hops": 2, + }, +] + + +# --------------------------------------------------------------------------- +# Environment +# --------------------------------------------------------------------------- + +class WebResearchEnv(HermesAgentBaseEnv): + """ + RL environment for training multi-step web research skills. + + The model is given a factual question requiring 2-3 hops of web research + and must use web_search / web_extract tools to find and synthesize the answer. + + Reward is multi-signal: + 60% — answer correctness (LLM judge) + 20% — tool usage (did the model actually search the web?) + 20% — efficiency (penalizes >6 tool calls) + + Bonus +0.1 for source diversity (≥2 distinct domains cited). 
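
    Worked example (illustrative numbers): judge correctness 0.8, web tools
    used, 4 tool calls, 3 distinct domains cited:
        reward = 0.6 * 0.8 + 0.2 * 1.0 + 0.2 * 1.0 + 0.1 = 0.98
    (The final reward is clamped to [0, 1] in compute_reward.)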
+ """ + + name = "web-research" + + # Default toolsets for this environment — web + file for saving notes + default_toolsets = ["web", "file"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._items: list[dict] = [] + self._eval_items: list[dict] = [] + self._index: int = 0 + self._total_scored: int = 0 + self._total_reward: float = 0.0 + + # ------------------------------------------------------------------ + # 1. Setup — load dataset + # ------------------------------------------------------------------ + + async def setup(self) -> None: + """Load the FRAMES benchmark or fall back to built-in samples.""" + if HF_AVAILABLE: + try: + logger.info("Loading FRAMES benchmark from HuggingFace...") + ds = load_dataset("google/frames-benchmark", split="test") + self._items = [ + { + "question": row["Prompt"], + "answer": row["Answer"], + "difficulty": row.get("reasoning_types", "unknown"), + "hops": 2, + } + for row in ds + ] + # Hold out 10% for eval + eval_size = max(20, len(self._items) // 10) + random.shuffle(self._items) + self._eval_items = self._items[:eval_size] + self._items = self._items[eval_size:] + logger.info( + f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items " + f"from FRAMES benchmark." + ) + return + except Exception as e: + logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.") + + # Fallback + random.shuffle(SAMPLE_QUESTIONS) + split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10) + self._items = SAMPLE_QUESTIONS[:split] + self._eval_items = SAMPLE_QUESTIONS[split:] + logger.info( + f"Using built-in sample dataset: {len(self._items)} train / " + f"{len(self._eval_items)} eval items." + ) + + # ------------------------------------------------------------------ + # 2. get_next_item — return the next question + # ------------------------------------------------------------------ + + async def get_next_item(self) -> dict: + """Return the next item, cycling through the dataset.""" + if not self._items: + raise RuntimeError("Dataset is empty. Did you call setup()?") + item = self._items[self._index % len(self._items)] + self._index += 1 + return item + + # ------------------------------------------------------------------ + # 3. format_prompt — build the user-facing prompt + # ------------------------------------------------------------------ + + def format_prompt(self, item: dict) -> str: + """ + Format the research question as a task prompt. + Instructs the model to use web search and cite sources. + """ + return ( + f"Research the following question thoroughly using web search. " + f"You MUST search the web to find current, accurate information — " + f"do not rely solely on your training data.\n\n" + f"Question: {item['question']}\n\n" + f"Requirements:\n" + f"- Use web_search and/or web_extract tools to find information\n" + f"- Search at least 2 different sources\n" + f"- Provide a concise, accurate answer (2-4 sentences)\n" + f"- Cite the sources you used" + ) + + # ------------------------------------------------------------------ + # 4. compute_reward — multi-signal scoring + # ------------------------------------------------------------------ + + async def compute_reward( + self, + item: dict, + result: dict, + ctx: Any, # ToolContext + ) -> float: + """ + Multi-signal reward function: + + 0.6 * correctness — LLM judge comparing answer to ground truth + 0.2 * tool_used — binary: did the model use web tools? 
            0.2 * efficiency — penalizes wasteful tool usage
            +0.1 bonus — source diversity (≥2 distinct domains)
        """
        final_response: str = result.get("final_response", "")
        tools_used: list[str] = result.get("tools_used", [])
        tool_call_count: int = result.get("tool_call_count", len(tools_used))

        # ---- Signal 1: Answer correctness (LLM judge) ----------------
        correctness = await self._llm_judge(
            question=item["question"],
            expected=item["answer"],
            model_answer=final_response,
            ctx=ctx,
        )

        # ---- Signal 2: Web tool usage --------------------------------
        web_tools = {"web_search", "web_extract", "search", "firecrawl"}
        tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0

        # ---- Signal 3: Efficiency ------------------------------------
        # Ideal: 2-5 tool calls. Penalised from the 6th call on; the
        # slope steepens past 10, and efficiency reaches 0.0 near 14.
        if tool_call_count <= 5:
            efficiency = 1.0
        elif tool_call_count <= 10:
            efficiency = 1.0 - (tool_call_count - 5) * 0.08
        else:
            efficiency = max(0.0, 1.0 - (tool_call_count - 5) * 0.12)

        # ---- Bonus: Source diversity ---------------------------------
        domains = self._extract_domains(final_response)
        diversity_bonus = 0.1 if len(domains) >= 2 else 0.0

        # ---- Combine ------------------------------------------------
        reward = (
            0.6 * correctness
            + 0.2 * tool_used
            + 0.2 * efficiency
            + diversity_bonus
        )
        reward = min(1.0, max(0.0, reward))  # clamp to [0, 1]

        # Track running stats
        self._total_scored += 1
        self._total_reward += reward

        logger.debug(
            f"Reward breakdown — correctness={correctness:.2f}, "
            f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
            f"diversity_bonus={diversity_bonus:.1f} → total={reward:.3f}"
        )

        return reward

    # ------------------------------------------------------------------
    # 5. evaluate — run on held-out eval split
    # ------------------------------------------------------------------

    async def evaluate(
        self,
        *args: Any,
        eval_size: Optional[int] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Run evaluation on the held-out split.
        Returns a dict of metrics for logging.
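
        Example return value (illustrative numbers; keys match the code below):
            {"eval/mean_reward": 0.64, "eval/mean_correctness": 0.58,
             "eval/n_items": 20, "train/mean_reward_so_far": 0.61}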
+ """ + items = self._eval_items + if eval_size: + items = items[:eval_size] + + if not items: + logger.warning("No eval items available.") + return {} + + logger.info(f"Running eval on {len(items)} questions...") + + rewards = [] + correctness_scores = [] + + for item in items: + try: + # Run the agent on each eval question + result = await self._run_agent_on_item(item) + reward = await self.compute_reward(item, result, ctx=None) + rewards.append(reward) + + # Also track raw correctness separately + if result.get("final_response"): + correctness_scores.append( + await self._llm_judge( + question=item["question"], + expected=item["answer"], + model_answer=result["final_response"], + ctx=None, + ) + ) + except Exception as e: + logger.error(f"Eval error on item: {e}") + rewards.append(0.0) + + metrics = { + "eval/mean_reward": sum(rewards) / len(rewards) if rewards else 0.0, + "eval/mean_correctness": ( + sum(correctness_scores) / len(correctness_scores) + if correctness_scores else 0.0 + ), + "eval/n_items": len(rewards), + "train/mean_reward_so_far": ( + self._total_reward / self._total_scored + if self._total_scored > 0 else 0.0 + ), + } + + logger.info( + f"Eval complete — mean_reward={metrics['eval/mean_reward']:.3f}, " + f"mean_correctness={metrics['eval/mean_correctness']:.3f}" + ) + return metrics + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + async def _llm_judge( + self, + question: str, + expected: str, + model_answer: str, + ctx: Any, + ) -> float: + """ + Use an LLM to judge whether `model_answer` correctly addresses + `question` compared to `expected`. Returns a float in [0, 1]. + + Uses the agent's own inference client if ctx is available, + otherwise falls back to a lightweight heuristic. + """ + if not model_answer or not model_answer.strip(): + return 0.0 + + # Build judge prompt + judge_prompt = ( + "You are an impartial judge evaluating the quality of an AI research answer.\n\n" + f"Question: {question}\n\n" + f"Reference answer: {expected}\n\n" + f"Model answer: {model_answer}\n\n" + "Score the model answer on a scale from 0.0 to 1.0 where:\n" + " 1.0 = fully correct and complete\n" + " 0.7 = mostly correct with minor gaps\n" + " 0.4 = partially correct\n" + " 0.1 = mentions relevant topic but wrong or very incomplete\n" + " 0.0 = completely wrong or no answer\n\n" + "Consider: factual accuracy, completeness, and relevance.\n" + "Respond with ONLY a JSON object: {\"score\": , \"reason\": \"\"}" + ) + + # Try using ctx for inference (Phase 2 / live training) + if ctx is not None and hasattr(ctx, "chat_completion"): + try: + response = await ctx.chat_completion( + messages=[{"role": "user", "content": judge_prompt}], + max_tokens=100, + temperature=0.0, + ) + text = response.get("content", "") + parsed = self._parse_judge_json(text) + if parsed is not None: + return float(parsed) + except Exception as e: + logger.debug(f"LLM judge via ctx failed: {e}. 
Using heuristic.") + + # Fallback: keyword overlap heuristic + return self._heuristic_score(expected, model_answer) + + @staticmethod + def _parse_judge_json(text: str) -> Optional[float]: + """Extract the score float from LLM judge JSON response.""" + try: + # Strip markdown code fences if present + clean = re.sub(r"```(?:json)?|```", "", text).strip() + data = json.loads(clean) + score = float(data.get("score", -1)) + if 0.0 <= score <= 1.0: + return score + except Exception: + # Try regex fallback + match = re.search(r'"score"\s*:\s*([0-9.]+)', text) + if match: + score = float(match.group(1)) + if 0.0 <= score <= 1.0: + return score + return None + + @staticmethod + def _heuristic_score(expected: str, model_answer: str) -> float: + """ + Lightweight keyword overlap score as fallback when no LLM is available. + Extracts meaningful tokens and computes Jaccard similarity. + """ + stopwords = { + "the", "a", "an", "is", "are", "was", "were", "of", "in", "on", + "at", "to", "for", "with", "and", "or", "but", "it", "its", + "this", "that", "as", "by", "from", "be", "has", "have", "had", + } + + def tokenize(text: str) -> set: + tokens = re.findall(r'\b[a-zA-Z0-9]+\b', text.lower()) + return {t for t in tokens if t not in stopwords and len(t) > 2} + + expected_tokens = tokenize(expected) + answer_tokens = tokenize(model_answer) + + if not expected_tokens: + return 0.5 # Can't judge + + overlap = len(expected_tokens & answer_tokens) + union = len(expected_tokens | answer_tokens) + + jaccard = overlap / union if union > 0 else 0.0 + # Recall-weighted: reward covering expected content + recall = overlap / len(expected_tokens) + return min(1.0, 0.4 * jaccard + 0.6 * recall) + + @staticmethod + def _extract_domains(text: str) -> set: + """ + Extract unique domains from URLs cited in the response. + Used to measure source diversity. + """ + urls = re.findall(r'https?://[^\s\)>\]"\']+', text) + domains = set() + for url in urls: + try: + parsed = urlparse(url) + # Normalize: strip www. + domain = parsed.netloc.lower().lstrip("www.") + if domain: + domains.add(domain) + except Exception: + pass + return domains + + async def _run_agent_on_item(self, item: dict) -> dict: + """ + Stub for running agent during eval. In Phase 1/2, this is handled + by the Atropos framework's rollout mechanism. Provided here for + standalone eval compatibility. + """ + # In real usage, the framework calls get_next_item + format_prompt + # and runs the agent. This stub returns an empty result for safety. 
+ return { + "final_response": "", + "tools_used": [], + "tool_call_count": 0, + } + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + WebResearchEnv.cli() From 36214d14db03cc17f8e16cf7c21333baaca27592 Mon Sep 17 00:00:00 2001 From: PercyDikec Date: Thu, 5 Mar 2026 21:12:53 +0300 Subject: [PATCH 03/14] fix(cli): use correct visibility filter string in codex API model fetch --- hermes_cli/codex_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hermes_cli/codex_models.py b/hermes_cli/codex_models.py index 416c76add5..662e576dc2 100644 --- a/hermes_cli/codex_models.py +++ b/hermes_cli/codex_models.py @@ -47,7 +47,7 @@ def _fetch_models_from_api(access_token: str) -> List[str]: if item.get("supported_in_api") is False: continue visibility = item.get("visibility", "") - if isinstance(visibility, str) and visibility.strip().lower() == "hide": + if isinstance(visibility, str) and visibility.strip().lower() == "hidden": continue priority = item.get("priority") rank = int(priority) if isinstance(priority, (int, float)) else 10_000 From 34e8d088c21f072a6f2fc9ffdaacbcd47e2a324e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 13:02:59 -0700 Subject: [PATCH 04/14] feat(slack): fix app_mention 404 + add document/video support - Register no-op app_mention event handler to suppress Bolt 404 errors. The 'message' handler already processes @mentions in channels, so app_mention is acknowledged without duplicate processing. - Add send_document() for native file attachments (PDFs, CSVs, etc.) via files_upload_v2, matching the pattern from Telegram PR #779. - Add send_video() for native video uploads via files_upload_v2. - Handle incoming document attachments from users: download, cache, and inject text content for .txt/.md files (capped at 100KB), following the same pattern as the Telegram adapter. - Add _download_slack_file_bytes() helper for raw byte downloads. - Add 24 new tests covering all new functionality. Fixes the unhandled app_mention events reported in gateway logs. --- gateway/platforms/slack.py | 134 +++++++++ tests/gateway/test_slack.py | 532 ++++++++++++++++++++++++++++++++++++ 2 files changed, 666 insertions(+) create mode 100644 tests/gateway/test_slack.py diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py index 11a73461e7..020843d3ac 100644 --- a/gateway/platforms/slack.py +++ b/gateway/platforms/slack.py @@ -10,6 +10,7 @@ Uses slack-bolt (Python) with Socket Mode for: import asyncio import os +import re from typing import Dict, List, Optional, Any try: @@ -33,6 +34,8 @@ from gateway.platforms.base import ( MessageEvent, MessageType, SendResult, + SUPPORTED_DOCUMENT_TYPES, + cache_document_from_bytes, cache_image_from_url, cache_audio_from_url, ) @@ -96,6 +99,13 @@ class SlackAdapter(BasePlatformAdapter): async def handle_message_event(event, say): await self._handle_slack_message(event) + # Acknowledge app_mention events to prevent Bolt 404 errors. + # The "message" handler above already processes @mentions in + # channels, so this is intentionally a no-op to avoid duplicates. 
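        # For reference, an app_mention event payload looks roughly like this
        # (abridged; see Slack's Events API docs for the full schema):
        #   {"type": "app_mention", "user": "U12345", "channel": "C0123",
        #    "text": "<@U_BOT> hello", "ts": "1234567890.000001"}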
+ @self._app.event("app_mention") + async def handle_app_mention(event, say): + pass + # Register slash command handler @self._app.command("/hermes") async def handle_hermes_command(ack, command): @@ -266,6 +276,65 @@ class SlackAdapter(BasePlatformAdapter): except Exception as e: return SendResult(success=False, error=str(e)) + async def send_video( + self, + chat_id: str, + video_path: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send a video file to Slack.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + if not os.path.exists(video_path): + return SendResult(success=False, error=f"Video file not found: {video_path}") + + try: + result = await self._app.client.files_upload_v2( + channel=chat_id, + file=video_path, + filename=os.path.basename(video_path), + initial_comment=caption or "", + thread_ts=reply_to, + ) + return SendResult(success=True, raw_response=result) + + except Exception as e: + print(f"[{self.name}] Failed to send video: {e}") + return await super().send_video(chat_id, video_path, caption, reply_to) + + async def send_document( + self, + chat_id: str, + file_path: str, + caption: Optional[str] = None, + file_name: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send a document/file attachment to Slack.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + if not os.path.exists(file_path): + return SendResult(success=False, error=f"File not found: {file_path}") + + display_name = file_name or os.path.basename(file_path) + + try: + result = await self._app.client.files_upload_v2( + channel=chat_id, + file=file_path, + filename=display_name, + initial_comment=caption or "", + thread_ts=reply_to, + ) + return SendResult(success=True, raw_response=result) + + except Exception as e: + print(f"[{self.name}] Failed to send document: {e}") + return await super().send_document(chat_id, file_path, caption, file_name, reply_to) + async def get_chat_info(self, chat_id: str) -> Dict[str, Any]: """Get information about a Slack channel.""" if not self._app: @@ -347,6 +416,58 @@ class SlackAdapter(BasePlatformAdapter): msg_type = MessageType.VOICE except Exception as e: print(f"[Slack] Failed to cache audio: {e}", flush=True) + elif url: + # Try to handle as a document attachment + try: + original_filename = f.get("name", "") + ext = "" + if original_filename: + _, ext = os.path.splitext(original_filename) + ext = ext.lower() + + # Fallback: reverse-lookup from MIME type + if not ext and mimetype: + mime_to_ext = {v: k for k, v in SUPPORTED_DOCUMENT_TYPES.items()} + ext = mime_to_ext.get(mimetype, "") + + if ext not in SUPPORTED_DOCUMENT_TYPES: + continue # Skip unsupported file types silently + + # Check file size (Slack limit: 20 MB for bots) + file_size = f.get("size", 0) + MAX_DOC_BYTES = 20 * 1024 * 1024 + if not file_size or file_size > MAX_DOC_BYTES: + print(f"[Slack] Document too large or unknown size: {file_size}", flush=True) + continue + + # Download and cache + raw_bytes = await self._download_slack_file_bytes(url) + cached_path = cache_document_from_bytes( + raw_bytes, original_filename or f"document{ext}" + ) + doc_mime = SUPPORTED_DOCUMENT_TYPES[ext] + media_urls.append(cached_path) + media_types.append(doc_mime) + msg_type = MessageType.DOCUMENT + print(f"[Slack] Cached user document: {cached_path}", flush=True) + + # Inject text content for .txt/.md files (capped at 100 KB) + MAX_TEXT_INJECT_BYTES = 100 * 1024 + if ext in 
(".md", ".txt") and len(raw_bytes) <= MAX_TEXT_INJECT_BYTES: + try: + text_content = raw_bytes.decode("utf-8") + display_name = original_filename or f"document{ext}" + display_name = re.sub(r'[^\w.\- ]', '_', display_name) + injection = f"[Content of {display_name}]:\n{text_content}" + if text: + text = f"{injection}\n\n{text}" + else: + text = injection + except UnicodeDecodeError: + pass # Binary content, skip injection + + except Exception as e: + print(f"[Slack] Failed to cache document: {e}", flush=True) # Build source source = self.build_source( @@ -427,3 +548,16 @@ class SlackAdapter(BasePlatformAdapter): else: from gateway.platforms.base import cache_image_from_bytes return cache_image_from_bytes(response.content, ext) + + async def _download_slack_file_bytes(self, url: str) -> bytes: + """Download a Slack file and return raw bytes.""" + import httpx + + bot_token = self.config.token + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get( + url, + headers={"Authorization": f"Bearer {bot_token}"}, + ) + response.raise_for_status() + return response.content diff --git a/tests/gateway/test_slack.py b/tests/gateway/test_slack.py new file mode 100644 index 0000000000..efdb62ce49 --- /dev/null +++ b/tests/gateway/test_slack.py @@ -0,0 +1,532 @@ +""" +Tests for Slack platform adapter. + +Covers: app_mention handler, send_document, send_video, + incoming document handling, message routing. + +Note: slack-bolt may not be installed in the test environment. +We mock the slack modules at import time to avoid collection errors. +""" + +import asyncio +import os +import sys +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from gateway.config import Platform, PlatformConfig +from gateway.platforms.base import ( + MessageEvent, + MessageType, + SendResult, + SUPPORTED_DOCUMENT_TYPES, +) + + +# --------------------------------------------------------------------------- +# Mock the slack-bolt package if it's not installed +# --------------------------------------------------------------------------- + +def _ensure_slack_mock(): + """Install mock slack modules so SlackAdapter can be imported.""" + if "slack_bolt" in sys.modules and hasattr(sys.modules["slack_bolt"], "__file__"): + return # Real library installed + + slack_bolt = MagicMock() + slack_bolt.async_app.AsyncApp = MagicMock + slack_bolt.adapter.socket_mode.async_handler.AsyncSocketModeHandler = MagicMock + + slack_sdk = MagicMock() + slack_sdk.web.async_client.AsyncWebClient = MagicMock + + for name, mod in [ + ("slack_bolt", slack_bolt), + ("slack_bolt.async_app", slack_bolt.async_app), + ("slack_bolt.adapter", slack_bolt.adapter), + ("slack_bolt.adapter.socket_mode", slack_bolt.adapter.socket_mode), + ("slack_bolt.adapter.socket_mode.async_handler", slack_bolt.adapter.socket_mode.async_handler), + ("slack_sdk", slack_sdk), + ("slack_sdk.web", slack_sdk.web), + ("slack_sdk.web.async_client", slack_sdk.web.async_client), + ]: + sys.modules.setdefault(name, mod) + + +_ensure_slack_mock() + +# Patch SLACK_AVAILABLE before importing the adapter +import gateway.platforms.slack as _slack_mod +_slack_mod.SLACK_AVAILABLE = True + +from gateway.platforms.slack import SlackAdapter # noqa: E402 + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture() +def adapter(): + config = PlatformConfig(enabled=True, token="xoxb-fake-token") + a = 
SlackAdapter(config) + # Mock the Slack app client + a._app = MagicMock() + a._app.client = AsyncMock() + a._bot_user_id = "U_BOT" + a._running = True + # Capture events instead of processing them + a.handle_message = AsyncMock() + return a + + +@pytest.fixture(autouse=True) +def _redirect_cache(tmp_path, monkeypatch): + """Point document cache to tmp_path so tests don't touch ~/.hermes.""" + monkeypatch.setattr( + "gateway.platforms.base.DOCUMENT_CACHE_DIR", tmp_path / "doc_cache" + ) + + +# --------------------------------------------------------------------------- +# TestAppMentionHandler +# --------------------------------------------------------------------------- + +class TestAppMentionHandler: + """Verify that the app_mention event handler is registered.""" + + def test_app_mention_registered_on_connect(self): + """connect() should register both 'message' and 'app_mention' handlers.""" + config = PlatformConfig(enabled=True, token="xoxb-fake") + adapter = SlackAdapter(config) + + # Track which events get registered + registered_events = [] + registered_commands = [] + + mock_app = MagicMock() + + def mock_event(event_type): + def decorator(fn): + registered_events.append(event_type) + return fn + return decorator + + def mock_command(cmd): + def decorator(fn): + registered_commands.append(cmd) + return fn + return decorator + + mock_app.event = mock_event + mock_app.command = mock_command + mock_app.client = AsyncMock() + mock_app.client.auth_test = AsyncMock(return_value={ + "user_id": "U_BOT", + "user": "testbot", + }) + + with patch.object(_slack_mod, "AsyncApp", return_value=mock_app), \ + patch.object(_slack_mod, "AsyncSocketModeHandler", return_value=MagicMock()), \ + patch.dict(os.environ, {"SLACK_APP_TOKEN": "xapp-fake"}), \ + patch("asyncio.create_task"): + asyncio.get_event_loop().run_until_complete(adapter.connect()) + + assert "message" in registered_events + assert "app_mention" in registered_events + assert "/hermes" in registered_commands + + +# --------------------------------------------------------------------------- +# TestSendDocument +# --------------------------------------------------------------------------- + +class TestSendDocument: + @pytest.mark.asyncio + async def test_send_document_success(self, adapter, tmp_path): + test_file = tmp_path / "report.pdf" + test_file.write_bytes(b"%PDF-1.4 fake content") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + caption="Here's the report", + ) + + assert result.success + adapter._app.client.files_upload_v2.assert_called_once() + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["channel"] == "C123" + assert call_kwargs["file"] == str(test_file) + assert call_kwargs["filename"] == "report.pdf" + assert call_kwargs["initial_comment"] == "Here's the report" + + @pytest.mark.asyncio + async def test_send_document_custom_name(self, adapter, tmp_path): + test_file = tmp_path / "data.csv" + test_file.write_bytes(b"a,b,c\n1,2,3") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + file_name="quarterly-report.csv", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["filename"] == "quarterly-report.csv" + + @pytest.mark.asyncio + async def test_send_document_missing_file(self, adapter): + result 
= await adapter.send_document( + chat_id="C123", + file_path="/nonexistent/file.pdf", + ) + + assert not result.success + assert "not found" in result.error.lower() + + @pytest.mark.asyncio + async def test_send_document_not_connected(self, adapter): + adapter._app = None + result = await adapter.send_document( + chat_id="C123", + file_path="/some/file.pdf", + ) + + assert not result.success + assert "Not connected" in result.error + + @pytest.mark.asyncio + async def test_send_document_api_error_falls_back(self, adapter, tmp_path): + test_file = tmp_path / "doc.pdf" + test_file.write_bytes(b"content") + + adapter._app.client.files_upload_v2 = AsyncMock( + side_effect=RuntimeError("Slack API error") + ) + + # Should fall back to base class (text message) + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + ) + + # Base class send() is also mocked, so check it was attempted + adapter._app.client.chat_postMessage.assert_called_once() + + @pytest.mark.asyncio + async def test_send_document_with_thread(self, adapter, tmp_path): + test_file = tmp_path / "notes.txt" + test_file.write_bytes(b"some notes") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_document( + chat_id="C123", + file_path=str(test_file), + reply_to="1234567890.123456", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["thread_ts"] == "1234567890.123456" + + +# --------------------------------------------------------------------------- +# TestSendVideo +# --------------------------------------------------------------------------- + +class TestSendVideo: + @pytest.mark.asyncio + async def test_send_video_success(self, adapter, tmp_path): + video = tmp_path / "clip.mp4" + video.write_bytes(b"fake video data") + + adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True}) + + result = await adapter.send_video( + chat_id="C123", + video_path=str(video), + caption="Check this out", + ) + + assert result.success + call_kwargs = adapter._app.client.files_upload_v2.call_args[1] + assert call_kwargs["filename"] == "clip.mp4" + assert call_kwargs["initial_comment"] == "Check this out" + + @pytest.mark.asyncio + async def test_send_video_missing_file(self, adapter): + result = await adapter.send_video( + chat_id="C123", + video_path="/nonexistent/video.mp4", + ) + + assert not result.success + assert "not found" in result.error.lower() + + @pytest.mark.asyncio + async def test_send_video_not_connected(self, adapter): + adapter._app = None + result = await adapter.send_video( + chat_id="C123", + video_path="/some/video.mp4", + ) + + assert not result.success + assert "Not connected" in result.error + + @pytest.mark.asyncio + async def test_send_video_api_error_falls_back(self, adapter, tmp_path): + video = tmp_path / "clip.mp4" + video.write_bytes(b"fake video") + + adapter._app.client.files_upload_v2 = AsyncMock( + side_effect=RuntimeError("Slack API error") + ) + + # Should fall back to base class (text message) + result = await adapter.send_video( + chat_id="C123", + video_path=str(video), + ) + + adapter._app.client.chat_postMessage.assert_called_once() + + +# --------------------------------------------------------------------------- +# TestIncomingDocumentHandling +# --------------------------------------------------------------------------- + +class TestIncomingDocumentHandling: + def _make_event(self, files=None, text="hello", channel_type="im"): + 
"""Build a mock Slack message event with file attachments.""" + return { + "text": text, + "user": "U_USER", + "channel": "C123", + "channel_type": channel_type, + "ts": "1234567890.000001", + "files": files or [], + } + + @pytest.mark.asyncio + async def test_pdf_document_cached(self, adapter): + """A PDF attachment should be downloaded, cached, and set as DOCUMENT type.""" + pdf_bytes = b"%PDF-1.4 fake content" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = pdf_bytes + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "report.pdf", + "url_private_download": "https://files.slack.com/report.pdf", + "size": len(pdf_bytes), + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.message_type == MessageType.DOCUMENT + assert len(msg_event.media_urls) == 1 + assert os.path.exists(msg_event.media_urls[0]) + assert msg_event.media_types == ["application/pdf"] + + @pytest.mark.asyncio + async def test_txt_document_injects_content(self, adapter): + """A .txt file under 100KB should have its content injected into event text.""" + content = b"Hello from a text file" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event( + text="summarize this", + files=[{ + "mimetype": "text/plain", + "name": "notes.txt", + "url_private_download": "https://files.slack.com/notes.txt", + "size": len(content), + }], + ) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert "Hello from a text file" in msg_event.text + assert "[Content of notes.txt]" in msg_event.text + assert "summarize this" in msg_event.text + + @pytest.mark.asyncio + async def test_md_document_injects_content(self, adapter): + """A .md file under 100KB should have its content injected.""" + content = b"# Title\nSome markdown content" + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event(files=[{ + "mimetype": "text/markdown", + "name": "readme.md", + "url_private_download": "https://files.slack.com/readme.md", + "size": len(content), + }], text="") + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert "# Title" in msg_event.text + + @pytest.mark.asyncio + async def test_large_txt_not_injected(self, adapter): + """A .txt file over 100KB should be cached but NOT injected.""" + content = b"x" * (200 * 1024) + + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.return_value = content + event = self._make_event(files=[{ + "mimetype": "text/plain", + "name": "big.txt", + "url_private_download": "https://files.slack.com/big.txt", + "size": len(content), + }], text="") + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert len(msg_event.media_urls) == 1 + assert "[Content of" not in (msg_event.text or "") + + @pytest.mark.asyncio + async def test_unsupported_file_type_skipped(self, adapter): + """A .zip file should be silently skipped.""" + event = self._make_event(files=[{ + "mimetype": "application/zip", + "name": "archive.zip", + "url_private_download": "https://files.slack.com/archive.zip", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] 
+ assert msg_event.message_type == MessageType.TEXT + assert len(msg_event.media_urls) == 0 + + @pytest.mark.asyncio + async def test_oversized_document_skipped(self, adapter): + """A document over 20MB should be skipped.""" + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "huge.pdf", + "url_private_download": "https://files.slack.com/huge.pdf", + "size": 25 * 1024 * 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert len(msg_event.media_urls) == 0 + + @pytest.mark.asyncio + async def test_document_download_error_handled(self, adapter): + """If document download fails, handler should not crash.""" + with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl: + dl.side_effect = RuntimeError("download failed") + event = self._make_event(files=[{ + "mimetype": "application/pdf", + "name": "report.pdf", + "url_private_download": "https://files.slack.com/report.pdf", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + # Handler should still be called (the exception is caught) + adapter.handle_message.assert_called_once() + + @pytest.mark.asyncio + async def test_image_still_handled(self, adapter): + """Image attachments should still go through the image path, not document.""" + with patch.object(adapter, "_download_slack_file", new_callable=AsyncMock) as dl: + dl.return_value = "/tmp/cached_image.jpg" + event = self._make_event(files=[{ + "mimetype": "image/jpeg", + "name": "photo.jpg", + "url_private_download": "https://files.slack.com/photo.jpg", + "size": 1024, + }]) + await adapter._handle_slack_message(event) + + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.message_type == MessageType.PHOTO + + +# --------------------------------------------------------------------------- +# TestMessageRouting +# --------------------------------------------------------------------------- + +class TestMessageRouting: + @pytest.mark.asyncio + async def test_dm_processed_without_mention(self, adapter): + """DM messages should be processed without requiring a bot mention.""" + event = { + "text": "hello", + "user": "U_USER", + "channel": "D123", + "channel_type": "im", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_called_once() + + @pytest.mark.asyncio + async def test_channel_message_requires_mention(self, adapter): + """Channel messages without a bot mention should be ignored.""" + event = { + "text": "just talking", + "user": "U_USER", + "channel": "C123", + "channel_type": "channel", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() + + @pytest.mark.asyncio + async def test_channel_mention_strips_bot_id(self, adapter): + """When mentioned in a channel, the bot mention should be stripped.""" + event = { + "text": "<@U_BOT> what's the weather?", + "user": "U_USER", + "channel": "C123", + "channel_type": "channel", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + msg_event = adapter.handle_message.call_args[0][0] + assert msg_event.text == "what's the weather?" 
+ assert "<@U_BOT>" not in msg_event.text + + @pytest.mark.asyncio + async def test_bot_messages_ignored(self, adapter): + """Messages from bots should be ignored.""" + event = { + "text": "bot response", + "bot_id": "B_OTHER", + "channel": "C123", + "channel_type": "im", + "ts": "1234567890.000001", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() + + @pytest.mark.asyncio + async def test_message_edits_ignored(self, adapter): + """Message edits should be ignored.""" + event = { + "text": "edited message", + "user": "U_USER", + "channel": "C123", + "channel_type": "im", + "ts": "1234567890.000001", + "subtype": "message_changed", + } + await adapter._handle_slack_message(event) + adapter.handle_message.assert_not_called() From ac58309dbdb363692a4bd853364533244620e548 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 14:00:11 -0700 Subject: [PATCH 05/14] docs: improve Slack setup guide with channel event subscriptions and scopes The #1 support issue with Slack is 'bot works in DMs but not channels'. This is almost always caused by missing event subscriptions (message.channels, message.groups) or missing OAuth scopes (channels:history, groups:history). Changes: - slack.md: Move channels:history and groups:history from optional to required scopes. Move message.channels and message.groups to required events. Add new 'How the Bot Responds' section explaining DM vs channel behavior. Add Step 8 for inviting bot to channels. Expand troubleshooting table with specific 'works in DMs not channels' entry. Add quick checklist for channel debugging. - setup.py: Expand Slack setup wizard with all required scopes, event subscriptions, and a warning that without message.channels/message.groups the bot only works in DMs. Add link to full docs. Improve Member ID discovery instructions. - config.py: Update SLACK_BOT_TOKEN and SLACK_APP_TOKEN descriptions to list required scopes and event subscriptions inline. --- hermes_cli/config.py | 8 +- hermes_cli/setup.py | 22 ++++-- website/docs/user-guide/messaging/slack.md | 89 +++++++++++++++++----- 3 files changed, 95 insertions(+), 24 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7a31b551d4..7b689d764c 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -401,14 +401,18 @@ OPTIONAL_ENV_VARS = { "category": "messaging", }, "SLACK_BOT_TOKEN": { - "description": "Slack bot integration", + "description": "Slack bot token (xoxb-). Get from OAuth & Permissions after installing your app. " + "Required scopes: chat:write, app_mentions:read, channels:history, groups:history, " + "im:history, im:read, im:write, users:read, files:write", "prompt": "Slack Bot Token (xoxb-...)", "url": "https://api.slack.com/apps", "password": True, "category": "messaging", }, "SLACK_APP_TOKEN": { - "description": "Slack Socket Mode connection", + "description": "Slack app-level token (xapp-) for Socket Mode. Get from Basic Information → " + "App-Level Tokens. Also ensure Event Subscriptions include: message.im, " + "message.channels, message.groups, app_mention", "prompt": "Slack App Token (xapp-...)", "url": "https://api.slack.com/apps", "password": True, diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index c10caec9b0..5880b7ef35 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1572,10 +1572,22 @@ def setup_gateway(config: dict): if not existing_slack and prompt_yes_no("Set up Slack bot?", False): print_info("Steps to create a Slack app:") - print_info(" 1. 
Go to https://api.slack.com/apps → Create New App") - print_info(" 2. Enable Socket Mode: App Settings → Socket Mode → Enable") - print_info(" 3. Bot Token: OAuth & Permissions → Install to Workspace") - print_info(" 4. App Token: Basic Information → App-Level Tokens → Generate") + print_info(" 1. Go to https://api.slack.com/apps → Create New App (from scratch)") + print_info(" 2. Enable Socket Mode: Settings → Socket Mode → Enable") + print_info(" • Create an App-Level Token with 'connections:write' scope") + print_info(" 3. Add Bot Token Scopes: Features → OAuth & Permissions") + print_info(" Required scopes: chat:write, app_mentions:read,") + print_info(" channels:history, channels:read, groups:history,") + print_info(" im:history, im:read, im:write, users:read, files:write") + print_info(" 4. Subscribe to Events: Features → Event Subscriptions → Enable") + print_info(" Required events: message.im, message.channels,") + print_info(" message.groups, app_mention") + print_warning(" ⚠ Without message.channels/message.groups events,") + print_warning(" the bot will ONLY work in DMs, not channels!") + print_info(" 5. Install to Workspace: Settings → Install App") + print_info(" 6. After installing, invite the bot to channels: /invite @YourBot") + print() + print_info(" Full guide: https://hermes-agent.ai/docs/user-guide/messaging/slack") print() bot_token = prompt("Slack Bot Token (xoxb-...)", password=True) if bot_token: @@ -1587,7 +1599,7 @@ def setup_gateway(config: dict): print() print_info("🔒 Security: Restrict who can use your bot") - print_info(" Find Slack user IDs in your profile or via the Slack API") + print_info(" To find a Member ID: click a user's name → View full profile → ⋮ → Copy member ID") print() allowed_users = prompt("Allowed user IDs (comma-separated, leave empty for open access)") if allowed_users: diff --git a/website/docs/user-guide/messaging/slack.md b/website/docs/user-guide/messaging/slack.md index 52dde5f6a9..65d27ee830 100644 --- a/website/docs/user-guide/messaging/slack.md +++ b/website/docs/user-guide/messaging/slack.md @@ -46,20 +46,26 @@ Navigate to **Features → OAuth & Permissions** in the sidebar. Scroll to **Sco | Scope | Purpose | |-------|---------| | `chat:write` | Send messages as the bot | -| `app_mentions:read` | Respond when @mentioned in channels | +| `app_mentions:read` | Detect when @mentioned in channels | | `channels:history` | Read messages in public channels the bot is in | | `channels:read` | List and get info about public channels | +| `groups:history` | Read messages in private channels the bot is invited to | | `im:history` | Read direct message history | | `im:read` | View basic DM info | | `im:write` | Open and manage DMs | | `users:read` | Look up user information | +| `files:write` | Upload files (images, audio, documents) | + +:::caution Missing scopes = missing features +Without `channels:history` and `groups:history`, the bot **will not receive messages in channels** — +it will only work in DMs. These are the most commonly missed scopes. +::: **Optional scopes:** | Scope | Purpose | |-------|---------| -| `groups:history` | Read messages in private channels the bot is invited to | -| `files:write` | Upload files (audio, images) | +| `groups:read` | List and get info about private channels | --- @@ -83,23 +89,27 @@ You can always find or regenerate app-level tokens under **Settings → Basic In ## Step 4: Subscribe to Events +This step is critical — it controls what messages the bot can see. + 1. 
In the sidebar, go to **Features → Event Subscriptions** 2. Toggle **Enable Events** to ON 3. Expand **Subscribe to bot events** and add: -| Event | Purpose | -|-------|---------| -| `app_mention` | Bot responds when @mentioned in any channel | -| `message.im` | Bot responds to direct messages | - -**Optional event:** - -| Event | Purpose | -|-------|---------| -| `message.channels` | Bot sees all messages in public channels it's added to | +| Event | Required? | Purpose | +|-------|-----------|---------| +| `message.im` | **Yes** | Bot receives direct messages | +| `message.channels` | **Yes** | Bot receives messages in **public** channels it's added to | +| `message.groups` | **Recommended** | Bot receives messages in **private** channels it's invited to | +| `app_mention` | **Yes** | Prevents Bolt SDK errors when bot is @mentioned | 4. Click **Save Changes** at the bottom of the page +:::danger Missing event subscriptions is the #1 setup issue +If the bot works in DMs but **not in channels**, you almost certainly forgot to add +`message.channels` (for public channels) and/or `message.groups` (for private channels). +Without these events, Slack simply never delivers channel messages to the bot. +::: + --- ## Step 5: Install App to Workspace @@ -111,8 +121,8 @@ You can always find or regenerate app-level tokens under **Settings → Basic In 5. **Copy this token** — this is your `SLACK_BOT_TOKEN` :::tip -If you change scopes later, you'll need to **reinstall the app** for the new scopes to take effect. -The Install App page will show a banner prompting you to do so. +If you change scopes or event subscriptions later, you **must reinstall the app** for the changes +to take effect. The Install App page will show a banner prompting you to do so. ::: --- @@ -139,7 +149,7 @@ Add the following to your `~/.hermes/.env` file: ```bash # Required SLACK_BOT_TOKEN=xoxb-your-bot-token-here -SLACK_APP_TOKEN=xapp-your-app-level-token-here +SLACK_APP_TOKEN=xapp-your-app-token-here SLACK_ALLOWED_USERS=U01ABC2DEF3 # Comma-separated Member IDs # Optional @@ -161,6 +171,35 @@ hermes gateway install # Install as a system service --- +## Step 8: Invite the Bot to Channels + +After starting the gateway, you need to **invite the bot** to any channel where you want it to respond: + +``` +/invite @Hermes Agent +``` + +The bot will **not** automatically join channels. You must invite it to each channel individually. + +--- + +## How the Bot Responds + +Understanding how Hermes behaves in different contexts: + +| Context | Behavior | +|---------|----------| +| **DMs** | Bot responds to every message — no @mention needed | +| **Channels** | Bot **only responds when @mentioned** (e.g., `@Hermes Agent what time is it?`) | +| **Threads** | Bot replies in threads when the triggering message is in a thread | + +:::tip +In channels, always @mention the bot. Simply typing a message without mentioning it will be ignored. +This is intentional — it prevents the bot from responding to every message in busy channels. 
+::: + +--- + ## Home Channel Set `SLACK_HOME_CHANNEL` to a channel ID where Hermes will deliver scheduled messages, @@ -192,11 +231,27 @@ Hermes supports voice on Slack: | Problem | Solution | |---------|----------| | Bot doesn't respond to DMs | Verify `message.im` is in your event subscriptions and the app is reinstalled | -| Bot doesn't respond to @mentions | Verify `app_mention` is in your event subscriptions | +| Bot works in DMs but not in channels | **Most common issue.** Add `message.channels` and `message.groups` to event subscriptions, reinstall the app, and invite the bot to the channel with `/invite @Hermes Agent` | +| Bot doesn't respond to @mentions in channels | 1) Check `message.channels` event is subscribed. 2) Bot must be invited to the channel. 3) Ensure `channels:history` scope is added. 4) Reinstall the app after scope/event changes | +| Bot ignores messages in private channels | Add both the `message.groups` event subscription and `groups:history` scope, then reinstall the app and `/invite` the bot | | "not_authed" or "invalid_auth" errors | Regenerate your Bot Token and App Token, update `.env` | | Bot responds but can't post in a channel | Invite the bot to the channel with `/invite @Hermes Agent` | | "missing_scope" error | Add the required scope in OAuth & Permissions, then **reinstall** the app | | Socket disconnects frequently | Check your network; Bolt auto-reconnects but unstable connections cause lag | +| Changed scopes/events but nothing changed | You **must reinstall** the app to your workspace after any scope or event subscription change | + +### Quick Checklist + +If the bot isn't working in channels, verify **all** of the following: + +1. ✅ `message.channels` event is subscribed (for public channels) +2. ✅ `message.groups` event is subscribed (for private channels) +3. ✅ `app_mention` event is subscribed +4. ✅ `channels:history` scope is added (for public channels) +5. ✅ `groups:history` scope is added (for private channels) +6. ✅ App was **reinstalled** after adding scopes/events +7. ✅ Bot was **invited** to the channel (`/invite @Hermes Agent`) +8. ✅ You are **@mentioning** the bot in your message --- From 64bec1d06040a503202a05538afbdb6cc8713be8 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 14:31:19 -0700 Subject: [PATCH 06/14] fix: Slack gateway setup missing event subscriptions and scopes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'hermes gateway setup' instructions for Slack were missing: - The 'Subscribe to Events' step entirely (message.im, message.channels, app_mention, message.groups) - Several required scopes (app_mentions:read, groups:history, users:read, files:write) - Warning about bot only working in DMs without message.channels - Step to invite the bot to channels The 'hermes setup' flow (setup.py) and the website docs (slack.md) already had the correct information — only gateway.py was outdated. Reported by JordanB on Slack. --- hermes_cli/gateway.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 64fe551bef..3d146546da 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -482,14 +482,19 @@ _PLATFORMS = [ "token_var": "SLACK_BOT_TOKEN", "setup_instructions": [ "1. Go to https://api.slack.com/apps → Create New App → From Scratch", - "2. Enable Socket Mode: App Settings → Socket Mode → Enable", - "3. 
Get Bot Token: OAuth & Permissions → Install to Workspace → copy xoxb-... token", - "4. Get App Token: Basic Information → App-Level Tokens → Generate", - " Name it anything, add scope: connections:write → copy xapp-... token", - "5. Add bot scopes: OAuth & Permissions → Scopes → chat:write, im:history,", - " im:read, im:write, channels:history, channels:read", - "6. Reinstall the app to your workspace after adding scopes", + "2. Enable Socket Mode: Settings → Socket Mode → Enable", + " Create an App-Level Token with scope: connections:write → copy xapp-... token", + "3. Add Bot Token Scopes: Features → OAuth & Permissions → Scopes", + " Required: chat:write, app_mentions:read, channels:history, channels:read,", + " groups:history, im:history, im:read, im:write, users:read, files:write", + "4. Subscribe to Events: Features → Event Subscriptions → Enable", + " Required events: message.im, message.channels, app_mention", + " Optional: message.groups (for private channels)", + " ⚠ Without message.channels the bot will ONLY work in DMs!", + "5. Install to Workspace: Settings → Install App → copy xoxb-... token", + "6. Reinstall the app after any scope or event changes", "7. Find your user ID: click your profile → three dots → Copy member ID", + "8. Invite the bot to channels: /invite @YourBot", ], "vars": [ {"name": "SLACK_BOT_TOKEN", "prompt": "Bot Token (xoxb-...)", "password": True, From 520aec20e06c1d11ca443f1753c25ddfe1d3d993 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:12:54 -0700 Subject: [PATCH 07/14] fix: add mcp to dev dependencies for test suite MCP tests import from mcp.types but mcp wasn't in the dev optional dependencies. Fresh 'pip install -e .[dev]' setups failed 3 tests. Based on PR #427 by @teyrebaz33 (applied manually due to stale branch). --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f86cabd2f..01bdaf7e23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ [project.optional-dependencies] modal = ["swe-rex[modal]>=1.4.0"] daytona = ["daytona>=0.148.0"] -dev = ["pytest", "pytest-asyncio"] +dev = ["pytest", "pytest-asyncio", "mcp>=1.2.0"] messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cron = ["croniter"] slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] From fa2e72ae9c61a231445f28114b4f63f957e59dd1 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:29:34 -0700 Subject: [PATCH 08/14] docs: document docker_volumes config for shared host directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Docker backend already supports user-configured volume mounts via docker_volumes, but it was undocumented — missing from DEFAULT_CONFIG, cli.py defaults, and configuration docs. 
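To illustrate the env-var alternative documented below, a minimal sketch of how
the JSON form round-trips (variable name from the docs; the actual consumer is
the terminal backend, so treat the docker-flag expansion here as hypothetical):

```python
import json
import os

# Config list -> env var -> parsed mounts. json.dumps() emits double-quoted
# JSON that json.loads() on the consumer side can parse; a Python repr via
# str(list) would use single quotes and fail to parse.
volumes = ["/home/user/projects:/workspace/projects", "/data:/data:ro"]
os.environ["TERMINAL_DOCKER_VOLUMES"] = json.dumps(volumes)

mounts = json.loads(os.environ["TERMINAL_DOCKER_VOLUMES"])
docker_args = [arg for m in mounts for arg in ("-v", m)]
# -> ["-v", "/home/user/projects:/workspace/projects", "-v", "/data:/data:ro"]
```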
Changes: - hermes_cli/config.py: Add docker_volumes to DEFAULT_CONFIG with inline documentation and examples - cli.py: Add docker_volumes to load_cli_config defaults - configuration.md: Full Docker Volume Mounts section with YAML examples, use cases (providing files, receiving outputs, shared workspaces), and env var alternative --- cli.py | 1 + hermes_cli/config.py | 4 +++ website/docs/user-guide/configuration.md | 32 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/cli.py b/cli.py index 61cb8d966c..c82e85dc86 100755 --- a/cli.py +++ b/cli.py @@ -158,6 +158,7 @@ def load_cli_config() -> Dict[str, Any]: "singularity_image": "docker://python:3.11", "modal_image": "python:3.11", "daytona_image": "nikolaik/python-nodejs:python3.11-nodejs20", + "docker_volumes": [], # host:container volume mounts for Docker backend }, "browser": { "inactivity_timeout": 120, # Auto-cleanup inactive browser sessions after 2 min diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7b689d764c..018ac6557f 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -77,6 +77,10 @@ DEFAULT_CONFIG = { "container_memory": 5120, # MB (default 5GB) "container_disk": 51200, # MB (default 50GB) "container_persistent": True, # Persist filesystem across sessions + # Docker volume mounts — share host directories with the container. + # Each entry is "host_path:container_path" (standard Docker -v syntax). + # Example: ["/home/user/projects:/workspace/projects", "/data:/data"] + "docker_volumes": [], }, "browser": { diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index b600a47619..5e6f9088fb 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -393,8 +393,40 @@ terminal: backend: local # or: docker, ssh, singularity, modal, daytona cwd: "." # Working directory ("." = current dir) timeout: 180 # Command timeout in seconds + + # Docker-specific settings + docker_image: "nikolaik/python-nodejs:python3.11-nodejs20" + docker_volumes: # Share host directories with the container + - "/home/user/projects:/workspace/projects" + - "/home/user/data:/data:ro" # :ro for read-only + + # Container resource limits (docker, singularity, modal, daytona) + container_cpu: 1 # CPU cores + container_memory: 5120 # MB (default 5GB) + container_disk: 51200 # MB (default 50GB) + container_persistent: true # Persist filesystem across sessions ``` +### Docker Volume Mounts + +When using the Docker backend, `docker_volumes` lets you share host directories with the container. Each entry uses standard Docker `-v` syntax: `host_path:container_path[:options]`. + +```yaml +terminal: + backend: docker + docker_volumes: + - "/home/user/projects:/workspace/projects" # Read-write (default) + - "/home/user/datasets:/data:ro" # Read-only + - "/home/user/outputs:/outputs" # Agent writes, you read +``` + +This is useful for: +- **Providing files** to the agent (datasets, configs, reference code) +- **Receiving files** from the agent (generated code, reports, exports) +- **Shared workspaces** where both you and the agent access the same files + +Can also be set via environment variable: `TERMINAL_DOCKER_VOLUMES='["/host:/container"]'` (JSON array). + See [Code Execution](features/code-execution.md) and the [Terminal section of the README](features/tools.md) for details on each backend. 
## Memory Configuration From 2d44ed1c5b862ab0b674b576505b643a66fb225e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:32:02 -0700 Subject: [PATCH 09/14] test: add comprehensive tests for vision_tools (42 tests) Covers PR #428 changes and existing vision_tools functionality: - _validate_image_url: 20 tests for urlparse-based validation - _determine_mime_type: 6 tests for MIME type detection - _image_to_base64_data_url: 3 tests for base64 conversion - _handle_vision_analyze: 5 tests for type hints, prompt building, AUXILIARY_VISION_MODEL env var override - Error logging exc_info: 3 async tests verifying stack traces are logged on download failure, analysis error, and cleanup error - check_vision_requirements & get_debug_session_info: 2 basic tests - Registry integration: 3 tests for tool registration --- tests/tools/test_vision_tools.py | 351 +++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 tests/tools/test_vision_tools.py diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py new file mode 100644 index 0000000000..3bdd301786 --- /dev/null +++ b/tests/tools/test_vision_tools.py @@ -0,0 +1,351 @@ +"""Tests for tools/vision_tools.py — URL validation, type hints, error logging.""" + +import asyncio +import json +import logging +import os +from pathlib import Path +from typing import Awaitable +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from tools.vision_tools import ( + _validate_image_url, + _handle_vision_analyze, + _determine_mime_type, + _image_to_base64_data_url, + vision_analyze_tool, + check_vision_requirements, + get_debug_session_info, +) + + +# --------------------------------------------------------------------------- +# _validate_image_url — urlparse-based validation +# --------------------------------------------------------------------------- + +class TestValidateImageUrl: + """Tests for URL validation, including urlparse-based netloc check.""" + + def test_valid_https_url(self): + assert _validate_image_url("https://example.com/image.jpg") is True + + def test_valid_http_url(self): + assert _validate_image_url("http://cdn.example.org/photo.png") is True + + def test_valid_url_without_extension(self): + """CDN endpoints that redirect to images should still pass.""" + assert _validate_image_url("https://cdn.example.com/abcdef123") is True + + def test_valid_url_with_query_params(self): + assert _validate_image_url("https://img.example.com/pic?w=200&h=200") is True + + def test_valid_url_with_port(self): + assert _validate_image_url("http://localhost:8080/image.png") is True + + def test_valid_url_with_path_only(self): + assert _validate_image_url("https://example.com/") is True + + def test_rejects_empty_string(self): + assert _validate_image_url("") is False + + def test_rejects_none(self): + assert _validate_image_url(None) is False + + def test_rejects_non_string(self): + assert _validate_image_url(12345) is False + + def test_rejects_ftp_scheme(self): + assert _validate_image_url("ftp://files.example.com/image.jpg") is False + + def test_rejects_file_scheme(self): + assert _validate_image_url("file:///etc/passwd") is False + + def test_rejects_no_scheme(self): + assert _validate_image_url("example.com/image.jpg") is False + + def test_rejects_javascript_scheme(self): + assert _validate_image_url("javascript:alert(1)") is False + + def test_rejects_http_without_netloc(self): + """http:// alone has no network location — urlparse catches this.""" + assert 
_validate_image_url("http://") is False + + def test_rejects_https_without_netloc(self): + assert _validate_image_url("https://") is False + + def test_rejects_http_colon_only(self): + assert _validate_image_url("http:") is False + + def test_rejects_data_url(self): + assert _validate_image_url("data:image/png;base64,iVBOR") is False + + def test_rejects_whitespace_only(self): + assert _validate_image_url(" ") is False + + def test_rejects_boolean(self): + assert _validate_image_url(True) is False + + def test_rejects_list(self): + assert _validate_image_url(["https://example.com"]) is False + + +# --------------------------------------------------------------------------- +# _determine_mime_type +# --------------------------------------------------------------------------- + +class TestDetermineMimeType: + def test_jpg(self): + assert _determine_mime_type(Path("photo.jpg")) == "image/jpeg" + + def test_jpeg(self): + assert _determine_mime_type(Path("photo.jpeg")) == "image/jpeg" + + def test_png(self): + assert _determine_mime_type(Path("screenshot.png")) == "image/png" + + def test_gif(self): + assert _determine_mime_type(Path("anim.gif")) == "image/gif" + + def test_webp(self): + assert _determine_mime_type(Path("modern.webp")) == "image/webp" + + def test_unknown_extension_defaults_to_jpeg(self): + assert _determine_mime_type(Path("file.xyz")) == "image/jpeg" + + +# --------------------------------------------------------------------------- +# _image_to_base64_data_url +# --------------------------------------------------------------------------- + +class TestImageToBase64DataUrl: + def test_returns_data_url(self, tmp_path): + img = tmp_path / "test.png" + img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 8) + result = _image_to_base64_data_url(img) + assert result.startswith("data:image/png;base64,") + + def test_custom_mime_type(self, tmp_path): + img = tmp_path / "test.bin" + img.write_bytes(b"\x00" * 16) + result = _image_to_base64_data_url(img, mime_type="image/webp") + assert result.startswith("data:image/webp;base64,") + + def test_file_not_found_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + _image_to_base64_data_url(tmp_path / "nonexistent.png") + + +# --------------------------------------------------------------------------- +# _handle_vision_analyze — type signature & behavior +# --------------------------------------------------------------------------- + +class TestHandleVisionAnalyze: + """Verify _handle_vision_analyze returns an Awaitable and builds correct prompt.""" + + def test_returns_awaitable(self): + """The handler must return an Awaitable (coroutine) since it's registered as async.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + result = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "What is this?"} + ) + # It should be an Awaitable (coroutine) + assert isinstance(result, Awaitable) + # Clean up the coroutine to avoid RuntimeWarning + result.close() + + def test_prompt_contains_question(self): + """The full prompt should incorporate the user's question.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "Describe the cat"} + ) + # Clean up coroutine + coro.close() + call_args = mock_tool.call_args + full_prompt = 
call_args[0][1] # second positional arg + assert "Describe the cat" in full_prompt + assert "Fully describe and explain" in full_prompt + + def test_uses_auxiliary_vision_model_env(self): + """AUXILIARY_VISION_MODEL env var should override DEFAULT_VISION_MODEL.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \ + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "custom/model-v1"}): + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] # third positional arg + assert model == "custom/model-v1" + + def test_falls_back_to_default_model(self): + """Without AUXILIARY_VISION_MODEL, should use DEFAULT_VISION_MODEL or fallback.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \ + patch.dict(os.environ, {}, clear=False): + # Ensure AUXILIARY_VISION_MODEL is not set + os.environ.pop("AUXILIARY_VISION_MODEL", None) + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] + # Should be DEFAULT_VISION_MODEL or the hardcoded fallback + assert model is not None + assert len(model) > 0 + + def test_empty_args_graceful(self): + """Missing keys should default to empty strings, not raise.""" + with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool: + mock_tool.return_value = json.dumps({"result": "ok"}) + result = _handle_vision_analyze({}) + assert isinstance(result, Awaitable) + result.close() + + +# --------------------------------------------------------------------------- +# Error logging with exc_info — verify tracebacks are logged +# --------------------------------------------------------------------------- + +class TestErrorLoggingExcInfo: + """Verify that exc_info=True is used in error/warning log calls.""" + + @pytest.mark.asyncio + async def test_download_failure_logs_exc_info(self, tmp_path, caplog): + """After max retries, the download error should include exc_info.""" + from tools.vision_tools import _download_image + + with patch("tools.vision_tools.httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.get = AsyncMock(side_effect=ConnectionError("network down")) + mock_client_cls.return_value = mock_client + + dest = tmp_path / "image.jpg" + with caplog.at_level(logging.ERROR, logger="tools.vision_tools"), \ + pytest.raises(ConnectionError): + await _download_image("https://example.com/img.jpg", dest, max_retries=1) + + # Should have logged with exc_info (traceback present) + error_records = [r for r in caplog.records if r.levelno >= logging.ERROR] + assert len(error_records) >= 1 + assert error_records[0].exc_info is not None + + @pytest.mark.asyncio + async def test_analysis_error_logs_exc_info(self, caplog): + """When vision_analyze_tool encounters an error, it should log with exc_info.""" + with patch("tools.vision_tools._validate_image_url", return_value=True), \ + patch("tools.vision_tools._download_image", new_callable=AsyncMock, + side_effect=Exception("download boom")), \ + caplog.at_level(logging.ERROR, logger="tools.vision_tools"): + + result = await 
vision_analyze_tool( + "https://example.com/img.jpg", "describe this", "test/model" + ) + result_data = json.loads(result) + # Error response uses "success": False, not an "error" key + assert result_data["success"] is False + + error_records = [r for r in caplog.records if r.levelno >= logging.ERROR] + assert any(r.exc_info is not None for r in error_records) + + @pytest.mark.asyncio + async def test_cleanup_error_logs_exc_info(self, tmp_path, caplog): + """Temp file cleanup failure should log warning with exc_info.""" + # Create a real temp file that will be "downloaded" + temp_dir = tmp_path / "temp_vision_images" + temp_dir.mkdir() + + async def fake_download(url, dest, max_retries=3): + """Simulate download by writing file to the expected destination.""" + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(b"\xff\xd8\xff" + b"\x00" * 16) + return dest + + with patch("tools.vision_tools._validate_image_url", return_value=True), \ + patch("tools.vision_tools._download_image", side_effect=fake_download), \ + patch("tools.vision_tools._image_to_base64_data_url", + return_value="data:image/jpeg;base64,abc"), \ + patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None), \ + patch("agent.auxiliary_client.auxiliary_max_tokens_param", return_value={"max_tokens": 2000}), \ + caplog.at_level(logging.WARNING, logger="tools.vision_tools"): + + # Mock the vision client + mock_client = AsyncMock() + mock_response = MagicMock() + mock_choice = MagicMock() + mock_choice.message.content = "A test image description" + mock_response.choices = [mock_choice] + mock_client.chat.completions.create = AsyncMock(return_value=mock_response) + + # Patch module-level _aux_async_client so the tool doesn't bail early + with patch("tools.vision_tools._aux_async_client", mock_client), \ + patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"): + + # Make unlink fail to trigger cleanup warning + original_unlink = Path.unlink + def failing_unlink(self, *args, **kwargs): + raise PermissionError("no permission") + + with patch.object(Path, "unlink", failing_unlink): + result = await vision_analyze_tool( + "https://example.com/tempimg.jpg", "describe", "test/model" + ) + + warning_records = [r for r in caplog.records if r.levelno == logging.WARNING + and "temporary file" in r.getMessage().lower()] + assert len(warning_records) >= 1 + assert warning_records[0].exc_info is not None + + +# --------------------------------------------------------------------------- +# check_vision_requirements & get_debug_session_info +# --------------------------------------------------------------------------- + +class TestVisionRequirements: + def test_check_requirements_returns_bool(self): + result = check_vision_requirements() + assert isinstance(result, bool) + + def test_debug_session_info_returns_dict(self): + info = get_debug_session_info() + assert isinstance(info, dict) + # DebugSession.get_session_info() returns these keys + assert "enabled" in info + assert "session_id" in info + assert "total_calls" in info + + +# --------------------------------------------------------------------------- +# Integration: registry entry +# --------------------------------------------------------------------------- + +class TestVisionRegistration: + def test_vision_analyze_registered(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + assert entry is not None + assert entry.toolset == "vision" + assert entry.is_async is True + + def 
test_schema_has_required_fields(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + schema = entry.schema + assert schema["name"] == "vision_analyze" + params = schema.get("parameters", {}) + props = params.get("properties", {}) + assert "image_url" in props + assert "question" in props + + def test_handler_is_callable(self): + from tools.registry import registry + entry = registry._tools.get("vision_analyze") + assert callable(entry.handler) From ef5d811abac69725208a90062f2da6ac502ef3ea Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:36:19 -0700 Subject: [PATCH 10/14] fix: vision auto-detection now falls back to custom/local endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vision auto-mode previously only tried OpenRouter, Nous, and Codex for multimodal — deliberately skipping custom endpoints with the assumption they 'may not handle vision input.' This caused silent failures for users running local multimodal models (Qwen-VL, LLaVA, Pixtral, etc.) without any cloud API keys. Now custom endpoints are tried as a last resort in auto mode. If the model doesn't support vision, the API call fails gracefully — but users with local vision models no longer need to manually set auxiliary.vision.provider: main in config.yaml. Reported by @Spadav and @kotyKD. --- agent/auxiliary_client.py | 10 +++++++--- tests/agent/test_auxiliary_client.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index a32e3a2937..57c3c11869 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -560,12 +560,16 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: forced = _get_auxiliary_provider("vision") if forced != "auto": return _resolve_forced_provider(forced) - # Auto: only multimodal-capable providers - for try_fn in (_try_openrouter, _try_nous, _try_codex): + # Auto: try providers known to support multimodal first, then fall + # back to the user's custom endpoint. Many local models (Qwen-VL, + # LLaVA, Pixtral, etc.) support vision — skipping them entirely + # caused silent failures for local-only users. + for try_fn in (_try_openrouter, _try_nous, _try_codex, + _try_custom_endpoint): client, model = try_fn() if client is not None: return client, model - logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous/Codex)") + logger.debug("Auxiliary vision client: none available") return None, None diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 66187d0554..299d083f20 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -176,14 +176,18 @@ class TestVisionClientFallback: assert isinstance(client, CodexAuxiliaryClient) assert model == "gpt-5.3-codex" - def test_vision_auto_skips_custom_endpoint(self, monkeypatch): - """Custom endpoint is skipped in vision auto mode.""" + def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch): + """Custom endpoint is used as fallback in vision auto mode. + + Many local models (Qwen-VL, LLaVA, etc.) support vision. + When no OpenRouter/Nous/Codex is available, try the custom endpoint. 
+ """ monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:1234/v1") monkeypatch.setenv("OPENAI_API_KEY", "local-key") - with patch("agent.auxiliary_client._read_nous_auth", return_value=None): + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI") as mock_openai: client, model = get_vision_auxiliary_client() - assert client is None - assert model is None + assert client is not None # Custom endpoint picked up as fallback def test_vision_uses_openrouter_when_available(self, monkeypatch): monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") From 9abd6bf342aa9e05339df53826b11610d102b39a Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:24:00 -0700 Subject: [PATCH 11/14] fix: gateway missing docker_volumes config bridge + list serialization bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway's config.yaml → env var bridge was missing docker_volumes, so Docker volume mounts configured in config.yaml were ignored for gateway sessions (Telegram, Discord, etc.) while working in CLI. Also fixes list serialization: str() produces Python repr with single quotes which json.loads() in terminal_tool.py can't parse. Now uses json.dumps() for list values. Based on PR #431 by @manuelschipper (applied manually due to stale branch). --- gateway/run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 2584521d12..6dd1a280a5 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -75,11 +75,16 @@ if _config_path.exists(): "container_memory": "TERMINAL_CONTAINER_MEMORY", "container_disk": "TERMINAL_CONTAINER_DISK", "container_persistent": "TERMINAL_CONTAINER_PERSISTENT", + "docker_volumes": "TERMINAL_DOCKER_VOLUMES", "sandbox_dir": "TERMINAL_SANDBOX_DIR", } for _cfg_key, _env_var in _terminal_env_map.items(): if _cfg_key in _terminal_cfg: - os.environ[_env_var] = str(_terminal_cfg[_cfg_key]) + _val = _terminal_cfg[_cfg_key] + if isinstance(_val, list): + os.environ[_env_var] = json.dumps(_val) + else: + os.environ[_env_var] = str(_val) _compression_cfg = _cfg.get("compression", {}) if _compression_cfg and isinstance(_compression_cfg, dict): _compression_env_map = { From 5212644861ffefe2a51b259692da564cf0d4aab7 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:33:19 -0700 Subject: [PATCH 12/14] fix(security): prevent shell injection in tilde-username path expansion Validate that the username portion of ~username paths contains only valid characters (alphanumeric, dot, hyphen, underscore) before passing to shell echo for expansion. Previously, paths like '~; rm -rf /' would be passed unquoted to self._exec(f'echo {path}'), allowing arbitrary command execution. The approach validates the username rather than using shlex.quote(), which would prevent tilde expansion from working at all since echo '~user' outputs the literal string instead of expanding it. Added tests for injection blocking and valid ~username/path expansion. Credit to @alireza78a for reporting (PR #442, issue #442). 
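A standalone sketch of the validation with example inputs (illustrative only;
the shipped check lives in _expand_path, and the helper name here is
hypothetical):

```python
import re

def tilde_username_is_safe(path: str) -> bool:
    # Mirror of the patched logic: pull out the ~username portion and
    # whitelist its characters before any shell ever sees the path.
    rest = path[1:]                      # strip the leading "~"
    slash_idx = rest.find("/")
    username = rest[:slash_idx] if slash_idx >= 0 else rest
    return bool(username) and re.fullmatch(r"[a-zA-Z0-9._-]+", username) is not None

assert tilde_username_is_safe("~root/file.txt")   # valid user: expansion may proceed
assert not tilde_username_is_safe("~; rm -rf /")  # ";" fails the whitelist: no shell call
```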
--- tests/tools/test_file_tools_live.py | 19 +++++++++++++++++++ tools/file_operations.py | 14 ++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/tools/test_file_tools_live.py b/tests/tools/test_file_tools_live.py index 426b3543bb..72efbb2375 100644 --- a/tests/tools/test_file_tools_live.py +++ b/tests/tools/test_file_tools_live.py @@ -505,6 +505,25 @@ class TestExpandPath: assert result == str(Path.home()) _assert_clean(result) + def test_tilde_injection_blocked(self, ops): + """Paths like ~; rm -rf / must NOT execute shell commands.""" + malicious = "~; echo PWNED > /tmp/_hermes_injection_test" + result = ops._expand_path(malicious) + # The invalid username (contains ";") should prevent shell expansion. + # The path should be returned as-is (no expansion). + assert result == malicious + # Verify the injected command did NOT execute + import os + assert not os.path.exists("/tmp/_hermes_injection_test") + + def test_tilde_username_with_subpath(self, ops): + """~root/file.txt should attempt expansion (valid username).""" + result = ops._expand_path("~root/file.txt") + # On most systems ~root expands to /root + if result != "~root/file.txt": + assert result.endswith("/file.txt") + assert "~" not in result + # ── Terminal output cleanliness ────────────────────────────────────────── diff --git a/tools/file_operations.py b/tools/file_operations.py index 3f72c5fdb1..b3b8f15309 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -400,10 +400,16 @@ class ShellFileOperations(FileOperations): return home elif path.startswith('~/'): return home + path[1:] # Replace ~ with home - # ~username format - let shell expand it - expand_result = self._exec(f"echo {path}") - if expand_result.exit_code == 0: - return expand_result.stdout.strip() + # ~username format - extract and validate username before + # letting shell expand it (prevent shell injection via + # paths like "~; rm -rf /"). + rest = path[1:] # strip leading ~ + slash_idx = rest.find('/') + username = rest[:slash_idx] if slash_idx >= 0 else rest + if username and re.fullmatch(r'[a-zA-Z0-9._-]+', username): + expand_result = self._exec(f"echo {path}") + if expand_result.exit_code == 0 and expand_result.stdout.strip(): + return expand_result.stdout.strip() return path From 8eabdefa8ac26b2ae799882c37bea91a50296d6e Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:45:50 -0700 Subject: [PATCH 13/14] fix: bring WebResearchEnv up to Atropos environment standards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The environment was merged missing several standard components. Updated to match the patterns established by 82 Atropos environments and our own HermesAgentBaseEnv contract. Added: - WebResearchEnvConfig — custom Pydantic config with reward weights, efficiency thresholds, eval settings, dataset config (all tunable via CLI/YAML without code changes) - config_init() classmethod — default server config (OpenRouter + Claude) so the env works out of the box - wandb_log() override — logs reward breakdown metrics (correctness, tool_usage, efficiency, diversity, correct_rate, tool_usage_rate) with proper buffer management and super() call - evaluate() — uses server.chat_completion instead of broken stub _run_agent_on_item(). Logs via evaluate_log() for lighteval- compatible output. 
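As a worked example of the default reward mix (weights from
WebResearchEnvConfig; the signal values below are hypothetical):

```python
# One rollout: judge score 0.8, web tools used, 4 tool calls (<= 5, so
# full efficiency credit), and at least 2 distinct domains cited.
correctness, tool_used, efficiency, diversity_bonus = 0.8, 1.0, 1.0, 0.1
reward = 0.6 * correctness + 0.2 * tool_used + 0.2 * efficiency + diversity_bonus
reward = min(1.0, max(0.0, reward))  # clamp to [0, 1] -> 0.98
```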
Fixed: - Removed broken _run_agent_on_item() stub that returned empty results - evaluate() now uses server.chat_completion (same pattern as TerminalTestEnv) for actual model evaluation - compute_reward reads tool calls from AgentResult properly - LLM judge uses self.server.chat_completion instead of ctx Reward config is now tunable without code changes: --env.correctness_weight 0.6 --env.tool_usage_weight 0.2 --env.efficiency_weight 0.2 --env.diversity_bonus 0.1 --env.efficient_max_calls 5 --- environments/web_research_env.py | 414 ++++++++++++++++++++----------- 1 file changed, 270 insertions(+), 144 deletions(-) diff --git a/environments/web_research_env.py b/environments/web_research_env.py index e73eb45c6d..a868cd034e 100644 --- a/environments/web_research_env.py +++ b/environments/web_research_env.py @@ -16,21 +16,18 @@ Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions Usage: # Phase 1 (OpenAI-compatible server) - python environments/web_research_env.py serve \ - --openai.base_url http://localhost:8000/v1 \ - --openai.model_name YourModel \ + python environments/web_research_env.py serve \\ + --openai.base_url http://localhost:8000/v1 \\ + --openai.model_name YourModel \\ --openai.server_type openai - # With eval split - python environments/web_research_env.py serve \ - --openai.base_url http://localhost:8000/v1 \ - --openai.model_name YourModel \ - --env.eval_every 50 \ - --env.eval_size 20 + # Process mode (offline data generation) + python environments/web_research_env.py process \\ + --env.data_path_to_save_groups data/web_research.jsonl - # Standalone eval (no training server needed) - python environments/web_research_env.py eval \ - --openai.base_url http://localhost:8000/v1 \ + # Standalone eval + python environments/web_research_env.py evaluate \\ + --openai.base_url http://localhost:8000/v1 \\ --openai.model_name YourModel Built by: github.com/jackx707 @@ -43,11 +40,21 @@ from __future__ import annotations import asyncio import json import logging +import os import random import re -from typing import Any, Optional +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse +from pydantic import Field + +# Ensure hermes-agent root is on path +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + # --------------------------------------------------------------------------- # Optional HuggingFace datasets import # --------------------------------------------------------------------------- @@ -57,13 +64,19 @@ try: except ImportError: HF_AVAILABLE = False -from environments.hermes_base_env import HermesAgentBaseEnv +from atroposlib.envs.base import ScoredDataGroup +from atroposlib.envs.server_handling.server_manager import APIServerConfig +from atroposlib.type_definitions import Item + +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.agent_loop import AgentResult +from environments.tool_context import ToolContext logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Fallback sample dataset (used when HuggingFace is unavailable) -# These are multi-hop questions that require real web search to answer. +# Multi-hop questions requiring real web search to answer. 
# --------------------------------------------------------------------------- SAMPLE_QUESTIONS = [ { @@ -129,6 +142,58 @@ SAMPLE_QUESTIONS = [ ] +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +class WebResearchEnvConfig(HermesAgentEnvConfig): + """Configuration for the web research RL environment.""" + + # Reward weights + correctness_weight: float = Field( + default=0.6, + description="Weight for answer correctness in reward (LLM judge score).", + ) + tool_usage_weight: float = Field( + default=0.2, + description="Weight for tool usage signal (did the model actually use web tools?).", + ) + efficiency_weight: float = Field( + default=0.2, + description="Weight for efficiency signal (penalizes excessive tool calls).", + ) + diversity_bonus: float = Field( + default=0.1, + description="Bonus reward for citing ≥2 distinct domains.", + ) + + # Efficiency thresholds + efficient_max_calls: int = Field( + default=5, + description="Maximum tool calls before efficiency penalty begins.", + ) + heavy_penalty_calls: int = Field( + default=10, + description="Tool call count where efficiency penalty steepens.", + ) + + # Eval + eval_size: int = Field( + default=20, + description="Number of held-out items for evaluation.", + ) + eval_split_ratio: float = Field( + default=0.1, + description="Fraction of dataset to hold out for evaluation (0.0–1.0).", + ) + + # Dataset + dataset_name: str = Field( + default="google/frames-benchmark", + description="HuggingFace dataset name for research questions.", + ) + + # --------------------------------------------------------------------------- # Environment # --------------------------------------------------------------------------- @@ -143,23 +208,60 @@ class WebResearchEnv(HermesAgentBaseEnv): Reward is multi-signal: 60% — answer correctness (LLM judge) 20% — tool usage (did the model actually search the web?) - 20% — efficiency (penalizes >6 tool calls) + 20% — efficiency (penalizes >5 tool calls) Bonus +0.1 for source diversity (≥2 distinct domains cited). """ name = "web-research" + env_config_cls = WebResearchEnvConfig # Default toolsets for this environment — web + file for saving notes default_toolsets = ["web", "file"] + @classmethod + def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]: + """Default configuration for the web research environment.""" + env_config = WebResearchEnvConfig( + enabled_toolsets=["web", "file"], + max_agent_turns=15, + agent_temperature=1.0, + system_prompt=( + "You are a highly capable research agent. When asked a factual question, " + "always use web_search to find current, accurate information before answering. " + "Cite at least 2 sources. Be concise and accurate." 
+ ), + group_size=4, + total_steps=1000, + steps_per_eval=100, + use_wandb=True, + wandb_name="web-research", + ) + + server_configs = [ + APIServerConfig( + base_url="https://openrouter.ai/api/v1", + model_name="anthropic/claude-sonnet-4.5", + server_type="openai", + api_key=os.getenv("OPENROUTER_API_KEY", ""), + health_check=False, + ) + ] + + return env_config, server_configs + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._items: list[dict] = [] self._eval_items: list[dict] = [] self._index: int = 0 - self._total_scored: int = 0 - self._total_reward: float = 0.0 + + # Metrics tracking for wandb + self._reward_buffer: list[float] = [] + self._correctness_buffer: list[float] = [] + self._tool_usage_buffer: list[float] = [] + self._efficiency_buffer: list[float] = [] + self._diversity_buffer: list[float] = [] # ------------------------------------------------------------------ # 1. Setup — load dataset @@ -170,7 +272,7 @@ class WebResearchEnv(HermesAgentBaseEnv): if HF_AVAILABLE: try: logger.info("Loading FRAMES benchmark from HuggingFace...") - ds = load_dataset("google/frames-benchmark", split="test") + ds = load_dataset(self.config.dataset_name, split="test") self._items = [ { "question": row["Prompt"], @@ -180,8 +282,11 @@ class WebResearchEnv(HermesAgentBaseEnv): } for row in ds ] - # Hold out 10% for eval - eval_size = max(20, len(self._items) // 10) + # Hold out for eval + eval_size = max( + self.config.eval_size, + int(len(self._items) * self.config.eval_split_ratio), + ) random.shuffle(self._items) self._eval_items = self._items[:eval_size] self._items = self._items[eval_size:] @@ -220,10 +325,7 @@ class WebResearchEnv(HermesAgentBaseEnv): # ------------------------------------------------------------------ def format_prompt(self, item: dict) -> str: - """ - Format the research question as a task prompt. - Instructs the model to use web search and cite sources. - """ + """Format the research question as a task prompt.""" return ( f"Research the following question thoroughly using web search. " f"You MUST search the web to find current, accurate information — " @@ -243,27 +345,30 @@ class WebResearchEnv(HermesAgentBaseEnv): async def compute_reward( self, item: dict, - result: dict, - ctx: Any, # ToolContext + result: AgentResult, + ctx: ToolContext, ) -> float: """ Multi-signal reward function: - 0.6 * correctness — LLM judge comparing answer to ground truth - 0.2 * tool_used — binary: did the model use web tools? - 0.2 * efficiency — penalizes wasteful tool usage - +0.1 bonus — source diversity (≥2 distinct domains) + correctness_weight * correctness — LLM judge comparing answer to ground truth + tool_usage_weight * tool_used — binary: did the model use web tools? 
+ efficiency_weight * efficiency — penalizes wasteful tool usage + + diversity_bonus — source diversity (≥2 distinct domains) """ - final_response: str = result.get("final_response", "") - tools_used: list[str] = result.get("tools_used", []) - tool_call_count: int = result.get("tool_call_count", len(tools_used)) + final_response: str = result.final_response or "" + tools_used: list[str] = [ + tc.tool_name for tc in (result.tool_calls or []) + ] if hasattr(result, "tool_calls") and result.tool_calls else [] + tool_call_count: int = result.turns_used or len(tools_used) + + cfg = self.config # ---- Signal 1: Answer correctness (LLM judge) ---------------- correctness = await self._llm_judge( question=item["question"], expected=item["answer"], model_answer=final_response, - ctx=ctx, ) # ---- Signal 2: Web tool usage -------------------------------- @@ -271,35 +376,37 @@ class WebResearchEnv(HermesAgentBaseEnv): tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0 # ---- Signal 3: Efficiency ------------------------------------ - # Ideal: 2-5 tool calls. Penalise beyond 6, hard cap at 15. - if tool_call_count <= 5: + if tool_call_count <= cfg.efficient_max_calls: efficiency = 1.0 - elif tool_call_count <= 10: - efficiency = 1.0 - (tool_call_count - 5) * 0.08 + elif tool_call_count <= cfg.heavy_penalty_calls: + efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08 else: - efficiency = max(0.0, 1.0 - (tool_call_count - 5) * 0.12) + efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12) # ---- Bonus: Source diversity --------------------------------- domains = self._extract_domains(final_response) - diversity_bonus = 0.1 if len(domains) >= 2 else 0.0 + diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0 # ---- Combine ------------------------------------------------ reward = ( - 0.6 * correctness - + 0.2 * tool_used - + 0.2 * efficiency - + diversity_bonus + cfg.correctness_weight * correctness + + cfg.tool_usage_weight * tool_used + + cfg.efficiency_weight * efficiency + + diversity ) reward = min(1.0, max(0.0, reward)) # clamp to [0, 1] - # Track running stats - self._total_scored += 1 - self._total_reward += reward + # Track for wandb + self._reward_buffer.append(reward) + self._correctness_buffer.append(correctness) + self._tool_usage_buffer.append(tool_used) + self._efficiency_buffer.append(efficiency) + self._diversity_buffer.append(diversity) logger.debug( f"Reward breakdown — correctness={correctness:.2f}, " f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, " - f"diversity_bonus={diversity_bonus:.1f} → total={reward:.3f}" + f"diversity={diversity:.1f} → total={reward:.3f}" ) return reward @@ -308,68 +415,117 @@ class WebResearchEnv(HermesAgentBaseEnv): # 5. evaluate — run on held-out eval split # ------------------------------------------------------------------ - async def evaluate( - self, - *args: Any, - eval_size: Optional[int] = None, - **kwargs: Any, - ) -> dict: - """ - Run evaluation on the held-out split. - Returns a dict of metrics for logging. 
- """ - items = self._eval_items - if eval_size: - items = items[:eval_size] + async def evaluate(self, *args, **kwargs) -> None: + """Run evaluation on the held-out split using the agent loop.""" + import time + items = self._eval_items if not items: logger.warning("No eval items available.") - return {} + return - logger.info(f"Running eval on {len(items)} questions...") + eval_size = min(self.config.eval_size, len(items)) + eval_items = items[:eval_size] - rewards = [] - correctness_scores = [] + logger.info(f"Running eval on {len(eval_items)} questions...") + start_time = time.time() + samples = [] - for item in items: + for item in eval_items: try: - # Run the agent on each eval question - result = await self._run_agent_on_item(item) - reward = await self.compute_reward(item, result, ctx=None) - rewards.append(reward) + # Use the base env's agent loop for eval (same as training) + prompt = self.format_prompt(item) + completion = await self.server.chat_completion( + messages=[ + {"role": "system", "content": self.config.system_prompt or ""}, + {"role": "user", "content": prompt}, + ], + n=1, + max_tokens=self.config.max_token_length, + temperature=0.0, + split="eval", + ) + + response_content = ( + completion.choices[0].message.content if completion.choices else "" + ) + + # Score the response + correctness = await self._llm_judge( + question=item["question"], + expected=item["answer"], + model_answer=response_content, + ) + + samples.append({ + "prompt": item["question"], + "response": response_content, + "expected": item["answer"], + "correctness": correctness, + }) - # Also track raw correctness separately - if result.get("final_response"): - correctness_scores.append( - await self._llm_judge( - question=item["question"], - expected=item["answer"], - model_answer=result["final_response"], - ctx=None, - ) - ) except Exception as e: logger.error(f"Eval error on item: {e}") - rewards.append(0.0) + samples.append({ + "prompt": item["question"], + "response": f"ERROR: {e}", + "expected": item["answer"], + "correctness": 0.0, + }) - metrics = { - "eval/mean_reward": sum(rewards) / len(rewards) if rewards else 0.0, + end_time = time.time() + + # Compute metrics + correctness_scores = [s["correctness"] for s in samples] + eval_metrics = { "eval/mean_correctness": ( sum(correctness_scores) / len(correctness_scores) if correctness_scores else 0.0 ), - "eval/n_items": len(rewards), - "train/mean_reward_so_far": ( - self._total_reward / self._total_scored - if self._total_scored > 0 else 0.0 - ), + "eval/n_items": len(samples), } - logger.info( - f"Eval complete — mean_reward={metrics['eval/mean_reward']:.3f}, " - f"mean_correctness={metrics['eval/mean_correctness']:.3f}" + await self.evaluate_log( + metrics=eval_metrics, + samples=samples, + start_time=start_time, + end_time=end_time, ) - return metrics + + # ------------------------------------------------------------------ + # 6. 
wandb_log — custom metrics
+    # ------------------------------------------------------------------
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
+        """Log reward breakdown metrics to wandb."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        if self._reward_buffer:
+            n = len(self._reward_buffer)
+            wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
+            wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
+            wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
+            wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
+            wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
+            wandb_metrics["train/total_rollouts"] = n
+
+            # Rate metrics (fraction of rollouts meeting thresholds)
+            wandb_metrics["train/correct_rate"] = (
+                sum(1 for c in self._correctness_buffer if c >= 0.7) / n
+            )
+            wandb_metrics["train/tool_usage_rate"] = (
+                sum(1 for t in self._tool_usage_buffer if t > 0) / n
+            )
+
+            # Clear buffers
+            self._reward_buffer.clear()
+            self._correctness_buffer.clear()
+            self._tool_usage_buffer.clear()
+            self._efficiency_buffer.clear()
+            self._diversity_buffer.clear()
+
+        await super().wandb_log(wandb_metrics)

     # ------------------------------------------------------------------
     # Private helpers
@@ -380,19 +536,14 @@ class WebResearchEnv(HermesAgentBaseEnv):
         question: str,
         expected: str,
         model_answer: str,
-        ctx: Any,
     ) -> float:
         """
-        Use an LLM to judge whether `model_answer` correctly addresses
-        `question` compared to `expected`. Returns a float in [0, 1].
-
-        Uses the agent's own inference client if ctx is available,
-        otherwise falls back to a lightweight heuristic.
+        Use the server's LLM to judge answer correctness.
+        Falls back to keyword heuristic if LLM call fails.
         """
         if not model_answer or not model_answer.strip():
             return 0.0

-        # Build judge prompt
         judge_prompt = (
             "You are an impartial judge evaluating the quality of an AI research answer.\n\n"
             f"Question: {question}\n\n"
@@ -405,39 +556,36 @@ class WebResearchEnv(HermesAgentBaseEnv):
             "  0.1 = mentions relevant topic but wrong or very incomplete\n"
             "  0.0 = completely wrong or no answer\n\n"
             "Consider: factual accuracy, completeness, and relevance.\n"
-            "Respond with ONLY a JSON object: {\"score\": <float>, \"reason\": \"<brief explanation>\"}"
+            'Respond with ONLY a JSON object: {"score": <float>, "reason": "<brief explanation>"}'
         )

-        # Try using ctx for inference (Phase 2 / live training)
-        if ctx is not None and hasattr(ctx, "chat_completion"):
-            try:
-                response = await ctx.chat_completion(
-                    messages=[{"role": "user", "content": judge_prompt}],
-                    max_tokens=100,
-                    temperature=0.0,
-                )
-                text = response.get("content", "")
-                parsed = self._parse_judge_json(text)
-                if parsed is not None:
-                    return float(parsed)
-            except Exception as e:
-                logger.debug(f"LLM judge via ctx failed: {e}. Using heuristic.")
+        try:
+            response = await self.server.chat_completion(
+                messages=[{"role": "user", "content": judge_prompt}],
+                n=1,
+                max_tokens=150,
+                temperature=0.0,
+                split="eval",
+            )
+            text = response.choices[0].message.content if response.choices else ""
+            parsed = self._parse_judge_json(text)
+            if parsed is not None:
+                return float(parsed)
+        except Exception as e:
+            logger.debug(f"LLM judge failed: {e}. 
Using heuristic.") - # Fallback: keyword overlap heuristic return self._heuristic_score(expected, model_answer) @staticmethod def _parse_judge_json(text: str) -> Optional[float]: """Extract the score float from LLM judge JSON response.""" try: - # Strip markdown code fences if present clean = re.sub(r"```(?:json)?|```", "", text).strip() data = json.loads(clean) score = float(data.get("score", -1)) if 0.0 <= score <= 1.0: return score except Exception: - # Try regex fallback match = re.search(r'"score"\s*:\s*([0-9.]+)', text) if match: score = float(match.group(1)) @@ -447,10 +595,7 @@ class WebResearchEnv(HermesAgentBaseEnv): @staticmethod def _heuristic_score(expected: str, model_answer: str) -> float: - """ - Lightweight keyword overlap score as fallback when no LLM is available. - Extracts meaningful tokens and computes Jaccard similarity. - """ + """Lightweight keyword overlap score as fallback.""" stopwords = { "the", "a", "an", "is", "are", "was", "were", "of", "in", "on", "at", "to", "for", "with", "and", "or", "but", "it", "its", @@ -458,35 +603,30 @@ class WebResearchEnv(HermesAgentBaseEnv): } def tokenize(text: str) -> set: - tokens = re.findall(r'\b[a-zA-Z0-9]+\b', text.lower()) + tokens = re.findall(r'\b\w+\b', text.lower()) return {t for t in tokens if t not in stopwords and len(t) > 2} expected_tokens = tokenize(expected) answer_tokens = tokenize(model_answer) if not expected_tokens: - return 0.5 # Can't judge + return 0.5 overlap = len(expected_tokens & answer_tokens) union = len(expected_tokens | answer_tokens) jaccard = overlap / union if union > 0 else 0.0 - # Recall-weighted: reward covering expected content recall = overlap / len(expected_tokens) return min(1.0, 0.4 * jaccard + 0.6 * recall) @staticmethod def _extract_domains(text: str) -> set: - """ - Extract unique domains from URLs cited in the response. - Used to measure source diversity. - """ + """Extract unique domains from URLs cited in the response.""" urls = re.findall(r'https?://[^\s\)>\]"\']+', text) domains = set() for url in urls: try: parsed = urlparse(url) - # Normalize: strip www. domain = parsed.netloc.lower().lstrip("www.") if domain: domains.add(domain) @@ -494,20 +634,6 @@ class WebResearchEnv(HermesAgentBaseEnv): pass return domains - async def _run_agent_on_item(self, item: dict) -> dict: - """ - Stub for running agent during eval. In Phase 1/2, this is handled - by the Atropos framework's rollout mechanism. Provided here for - standalone eval compatibility. - """ - # In real usage, the framework calls get_next_item + format_prompt - # and runs the agent. This stub returns an empty result for safety. - return { - "final_response": "", - "tools_used": [], - "tool_call_count": 0, - } - # --------------------------------------------------------------------------- # Entry point From 172a38c344a372296ea995258d2251be4245ba04 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 17:52:33 -0700 Subject: [PATCH 14/14] fix: Docker persistent bind mounts fail with Permission denied MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cap-drop ALL removes DAC_OVERRIDE, which root needs to write to bind-mounted directories owned by the host user (uid 1000). This broke persistent Docker sandboxes — the container couldn't write to /workspace or /root. 
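One way to see which capabilities a running container actually holds is to
decode its effective capability mask (a sketch; the bit numbers CHOWN=0,
DAC_OVERRIDE=1, FOWNER=3 come from linux/capability.h):

```python
# Run inside the container: read CapEff from /proc/self/status and test
# the three capabilities this fix re-adds.
with open("/proc/self/status") as f:
    cap_eff = next(int(line.split()[1], 16) for line in f if line.startswith("CapEff"))

for name, bit in (("CHOWN", 0), ("DAC_OVERRIDE", 1), ("FOWNER", 3)):
    print(f"{name}: {'present' if cap_eff >> bit & 1 else 'missing'}")
```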
Add back the minimum capabilities needed: - DAC_OVERRIDE: root can write to bind-mounted dirs owned by host user - CHOWN: package managers (pip, npm, apt) need to set file ownership - FOWNER: needed for operations on files owned by other users Still drops all other capabilities (NET_RAW, SYS_ADMIN, etc.) and keeps no-new-privileges. Security boundary is the container itself. Verified end-to-end: create files → destroy container → new container with same task_id → files persist on host and are accessible in the new container. --- tools/environments/docker.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/environments/docker.py b/tools/environments/docker.py index 85184fde7c..faf01b2a25 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -22,10 +22,16 @@ logger = logging.getLogger(__name__) # Security flags applied to every container. # The container itself is the security boundary (isolated from host). -# We drop all capabilities, block privilege escalation, and limit PIDs. +# We drop all capabilities then add back the minimum needed: +# DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user +# CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership +# Block privilege escalation and limit PIDs. # /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds). _SECURITY_ARGS = [ "--cap-drop", "ALL", + "--cap-add", "DAC_OVERRIDE", + "--cap-add", "CHOWN", + "--cap-add", "FOWNER", "--security-opt", "no-new-privileges", "--pids-limit", "256", "--tmpfs", "/tmp:rw,nosuid,size=512m",