Merge branch 'main' into pr-635

2026-04-30 01:41:43 +00:00 · 2026-03-07 20:36:42 -08:00 · 2026-03-07 20:36:42 -08:00 · c5a9d1ef9d
commit c5a9d1ef9d
parent 4447e7d71a c7b6f423c7
5 changed files with 344 additions and 2 deletions
--- a/21
+++ b/21
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2025 Nous Research
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/gateway/run.py
+++ b/gateway/run.py
@ -831,6 +831,167 @@ class GatewayRunner:
        # Load conversation history from transcript
        history = self.session_store.load_transcript(session_entry.session_id)
        # -----------------------------------------------------------------
        # Session hygiene: auto-compress pathologically large transcripts
        #
        # Long-lived gateway sessions can accumulate enough history that
        # every new message rehydrates an oversized transcript, causing
        # repeated truncation/context failures.  Detect this early and
        # compress proactively — before the agent even starts.  (#628)
        # -----------------------------------------------------------------
        if history and len(history) >= 4:
            from agent.model_metadata import estimate_messages_tokens_rough
            # Read thresholds from config.yaml → session_hygiene section
            _hygiene_cfg = {}
            try:
                _hyg_cfg_path = _hermes_home / "config.yaml"
                if _hyg_cfg_path.exists():
                    import yaml as _hyg_yaml
                    with open(_hyg_cfg_path) as _hyg_f:
                        _hyg_data = _hyg_yaml.safe_load(_hyg_f) or {}
                    _hygiene_cfg = _hyg_data.get("session_hygiene", {})
                    if not isinstance(_hygiene_cfg, dict):
                        _hygiene_cfg = {}
            except Exception:
                pass
            _compress_token_threshold = int(
                _hygiene_cfg.get("auto_compress_tokens", 100_000)
            )
            _compress_msg_threshold = int(
                _hygiene_cfg.get("auto_compress_messages", 200)
            )
            _warn_token_threshold = int(
                _hygiene_cfg.get("warn_tokens", 200_000)
            )
            _msg_count = len(history)
            _approx_tokens = estimate_messages_tokens_rough(history)
            _needs_compress = (
                _approx_tokens >= _compress_token_threshold
                or _msg_count >= _compress_msg_threshold
            )
            if _needs_compress:
                logger.info(
                    "Session hygiene: %s messages, ~%s tokens — auto-compressing "
                    "(thresholds: %s msgs / %s tokens)",
                    _msg_count, f"{_approx_tokens:,}",
                    _compress_msg_threshold, f"{_compress_token_threshold:,}",
                )
                _hyg_adapter = self.adapters.get(source.platform)
                if _hyg_adapter:
                    try:
                        await _hyg_adapter.send(
                            source.chat_id,
                            f"🗜️ Session is large ({_msg_count} messages, "
                            f"~{_approx_tokens:,} tokens). Auto-compressing..."
                        )
                    except Exception:
                        pass
                try:
                    from run_agent import AIAgent
                    _hyg_runtime = _resolve_runtime_agent_kwargs()
                    if _hyg_runtime.get("api_key"):
                        _hyg_msgs = [
                            {"role": m.get("role"), "content": m.get("content")}
                            for m in history
                            if m.get("role") in ("user", "assistant")
                            and m.get("content")
                        ]
                        if len(_hyg_msgs) >= 4:
                            _hyg_agent = AIAgent(
                                **_hyg_runtime,
                                max_iterations=4,
                                quiet_mode=True,
                                enabled_toolsets=["memory"],
                                session_id=session_entry.session_id,
                            )
                            loop = asyncio.get_event_loop()
                            _compressed, _ = await loop.run_in_executor(
                                None,
                                lambda: _hyg_agent._compress_context(
                                    _hyg_msgs, "",
                                    approx_tokens=_approx_tokens,
                                ),
                            )
                            self.session_store.rewrite_transcript(
                                session_entry.session_id, _compressed
                            )
                            history = _compressed
                            _new_count = len(_compressed)
                            _new_tokens = estimate_messages_tokens_rough(
                                _compressed
                            )
                            logger.info(
                                "Session hygiene: compressed %s → %s msgs, "
                                "~%s → ~%s tokens",
                                _msg_count, _new_count,
                                f"{_approx_tokens:,}", f"{_new_tokens:,}",
                            )
                            if _hyg_adapter:
                                try:
                                    await _hyg_adapter.send(
                                        source.chat_id,
                                        f"🗜️ Compressed: {_msg_count} → "
                                        f"{_new_count} messages, "
                                        f"~{_approx_tokens:,} → "
                                        f"~{_new_tokens:,} tokens"
                                    )
                                except Exception:
                                    pass
                            # Still too large after compression — warn user
                            if _new_tokens >= _warn_token_threshold:
                                logger.warning(
                                    "Session hygiene: still ~%s tokens after "
                                    "compression — suggesting /reset",
                                    f"{_new_tokens:,}",
                                )
                                if _hyg_adapter:
                                    try:
                                        await _hyg_adapter.send(
                                            source.chat_id,
                                            "⚠️ Session is still very large "
                                            "after compression "
                                            f"(~{_new_tokens:,} tokens). "
                                            "Consider using /reset to start "
                                            "fresh if you experience issues."
                                        )
                                    except Exception:
                                        pass
                except Exception as e:
                    logger.warning(
                        "Session hygiene auto-compress failed: %s", e
                    )
                    # Compression failed and session is dangerously large
                    if _approx_tokens >= _warn_token_threshold:
                        _hyg_adapter = self.adapters.get(source.platform)
                        if _hyg_adapter:
                            try:
                                await _hyg_adapter.send(
                                    source.chat_id,
                                    f"⚠️ Session is very large "
                                    f"({_msg_count} messages, "
                                    f"~{_approx_tokens:,} tokens) and "
                                    "auto-compression failed. Consider "
                                    "using /compress or /reset to avoid "
                                    "issues."
                                )
                            except Exception:
                                pass
        # First-message onboarding -- only on the very first interaction ever
        if not history and not self.session_store.has_any_sessions():
            context_prompt += (
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@ -0,0 +1,159 @@
 """Tests for gateway session hygiene — auto-compression of large sessions.
 Verifies that the gateway detects pathologically large transcripts and
 triggers auto-compression before running the agent.  (#628)
 """
 import pytest
 from unittest.mock import patch, MagicMock, AsyncMock
 from agent.model_metadata import estimate_messages_tokens_rough
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _make_history(n_messages: int, content_size: int = 100) -> list:
    """Build a fake transcript with n_messages user/assistant pairs."""
    history = []
    content = "x" * content_size
    for i in range(n_messages):
        role = "user" if i % 2 == 0 else "assistant"
        history.append({"role": role, "content": content, "timestamp": f"t{i}"})
    return history
 def _make_large_history_tokens(target_tokens: int) -> list:
    """Build a 50-message history sized to estimate at roughly ``target_tokens``."""
    # Sizing assumes the estimator behaves like len(str(msg)) // 4 summed over
    # messages (TODO confirm against estimate_messages_tokens_rough), so the
    # character budget is four times the token target.
    message_count = 50
    # Approximate non-content characters in each message's dict repr.
    per_message_overhead = 60
    chars_per_message = (target_tokens * 4) // message_count
    # Never go below 10 content characters, even for tiny targets.
    return _make_history(
        message_count,
        content_size=max(10, chars_per_message - per_message_overhead),
    )
 # ---------------------------------------------------------------------------
 # Detection threshold tests
 # ---------------------------------------------------------------------------
 class TestSessionHygieneThresholds:
    """Check the size comparison used to decide whether a session is too large.
    Each test recomputes the comparison locally against the default (or a
    custom) limit; none of them invokes the gateway itself.
    """
    def test_small_session_below_thresholds(self):
        """Ten short messages stay under both default limits."""
        transcript = _make_history(10)
        n_msgs = len(transcript)
        est_tokens = estimate_messages_tokens_rough(transcript)
        token_limit, msg_limit = 100_000, 200
        assert not (est_tokens >= token_limit or n_msgs >= msg_limit)
    def test_large_message_count_triggers(self):
        """250 tiny messages exceed the 200-message limit on count alone."""
        transcript = _make_history(250, content_size=10)
        msg_limit = 200
        assert len(transcript) >= msg_limit
    def test_large_token_count_triggers(self):
        """50 messages of 10,000 characters exceed the 100K-token limit."""
        transcript = _make_history(50, content_size=10_000)
        token_limit = 100_000
        assert estimate_messages_tokens_rough(transcript) >= token_limit
    def test_under_both_thresholds_no_trigger(self):
        """100 short messages are under both the token and the count limit."""
        transcript = _make_history(100, content_size=100)
        n_msgs = len(transcript)
        est_tokens = estimate_messages_tokens_rough(transcript)
        token_limit, msg_limit = 100_000, 200
        assert not (est_tokens >= token_limit or n_msgs >= msg_limit)
    def test_custom_thresholds(self):
        """A 60-message session trips a limit of 50 but not a limit of 100."""
        n_msgs = len(_make_history(60, content_size=100))
        # Custom lower threshold
        lowered_limit = 50
        assert n_msgs >= lowered_limit
        # Custom higher threshold
        raised_limit = 100
        assert not n_msgs >= raised_limit
    def test_minimum_message_guard(self):
        """Three messages fall below the gateway's four-message minimum."""
        transcript = _make_history(3, content_size=100_000)
        # The gateway only evaluates thresholds when `len(history) >= 4`, so
        # this checks that precondition rather than the size comparison.
        minimum_messages = 4
        assert len(transcript) < minimum_messages
 class TestSessionHygieneWarnThreshold:
    """Check the comparison behind the post-compression size warning."""
    # Default warn_tokens value that both tests compare against.
    WARN_TOKENS = 200_000
    def test_warn_when_still_large(self):
        """A simulated 250K-token result is at or above the warn limit."""
        tokens_after_compress = 250_000
        assert tokens_after_compress >= self.WARN_TOKENS
    def test_no_warn_when_under(self):
        """A simulated 150K-token result is below the warn limit."""
        tokens_after_compress = 150_000
        assert tokens_after_compress < self.WARN_TOKENS
 class TestTokenEstimation:
    """Sanity checks on the rough token estimator the hygiene pass relies on."""
    def test_empty_history(self):
        """An empty transcript estimates to zero tokens."""
        empty_estimate = estimate_messages_tokens_rough([])
        assert empty_estimate == 0
    def test_proportional_to_content(self):
        """Longer message content gives a larger estimate at equal count."""
        short_tokens = estimate_messages_tokens_rough(
            _make_history(10, content_size=100)
        )
        long_tokens = estimate_messages_tokens_rough(
            _make_history(10, content_size=10_000)
        )
        assert long_tokens > short_tokens
    def test_proportional_to_count(self):
        """More messages give a larger estimate at equal content size."""
        few_tokens = estimate_messages_tokens_rough(
            _make_history(10, content_size=1000)
        )
        many_tokens = estimate_messages_tokens_rough(
            _make_history(100, content_size=1000)
        )
        assert many_tokens > few_tokens
    def test_pathological_session_detected(self):
        """The reported pathological case: 648 messages, ~299K tokens."""
        # 648 messages of 1800 characters each stand in for the reported session.
        transcript = _make_history(648, content_size=1800)
        est_tokens = estimate_messages_tokens_rough(transcript)
        # Both default limits (100K tokens, 200 messages) must be exceeded.
        assert est_tokens > 100_000
        assert len(transcript) > 200
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@ -78,7 +78,7 @@ _TOOL_STUBS = {
    "web_extract": (
        "web_extract",
        "urls: list",
-        '"""Extract content from URLs. Returns dict with results list of {url, content, error}."""',
+        '"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""',
        '{"urls": urls}',
    ),
    "read_file": (
@ -605,7 +605,7 @@ _TOOL_DOC_LINES = [
     "    Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"),
    ("web_extract",
     "  web_extract(urls: list[str]) -> dict\n"
-     "    Returns {\"results\": [{\"url\", \"content\", \"error\"}, ...]} where content is markdown"),
+     "    Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"),
    ("read_file",
     "  read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n"
     "    Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"),
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@ -787,6 +787,7 @@ async def web_extract_tool(
        # Trim output to minimal fields per entry: url, title, content, error
        trimmed_results = [
            {
                "url": r.get("url", ""),
                "title": r.get("title", ""),
                "content": r.get("content", ""),
                "error": r.get("error"),